網路爬蟲之獲取圖片到本地
阿新 • • 發佈:2019-02-14
/*
* Created on Aug 26, 2011 2:41:26 PM
*
* HtmlSourceGetter.java
*
* NOTICE OF PROPRIETARY RIGHTS
*
* This program is a confidential trade secret and the property of author. Use, examination,
* reproduction, disassembly, decompiling, transfer and/or disclosure to others of
* all or any part of this software program are strictly prohibited except by express
* written agreement with author.
*
* --------------------------------------------------------------------------------------
* Modification History
* Date Author Version Description
* Aug 26, 2011 Cross 1.0 New
* --------------------------------------------------------------------------------------
*/
package com.cross.tools;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
public class HtmlSourceGetter {
// Class-wide (static) resource holders. Static reference fields default to null,
// so no explicit initializer is needed.
// NOTE(review): being static, these are shared by every caller of the class — confirm that is intended.
/** HTTP connection holder. */
private static HttpURLConnection con;
/** Buffered input stream holder. */
private static BufferedInputStream bis;
/** Output stream holder. */
private static OutputStream out;
public static void getSource(String url) {
public static void parseHTML(String url, String keyword) {
private static void processNodeList(NodeList list, String keyword) {
/**
 * Parses the page at {@code url}, finds every {@code <img>} tag and downloads each image
 * into the {@code c:/cross/} directory.
 *
 * <p>Each file is named {@code <index>_<currentTimeMillis><extension>}, where the extension
 * is taken from the last path segment of the image URL (empty when that segment has none).
 * The image URL and the tag text are echoed to standard output before each download.
 *
 * <p>Any failure (parse error, bad URL, I/O error) stops the crawl; the stack trace is
 * printed to standard error and the method returns normally.
 *
 * @param url address of the HTML page to scan for images
 */
public static void extractLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8");

        NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
        NodeList nodeList = parser.extractAllNodesThatMatch(imageFilter);

        // Make sure the target directory exists; otherwise every FileOutputStream would fail.
        File targetDir = new File("c:/cross/");
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            throw new IOException("cannot create directory " + targetDir);
        }

        for (int i = 0; i < nodeList.size(); i++) {
            ImageTag image = (ImageTag) nodeList.elementAt(i);
            String imageURL = image.getImageURL();
            System.out.println("imageURL:" + imageURL);
            System.out.println("imageText:" + image.getText());

            String fileName = i + "_" + System.currentTimeMillis() + extensionOf(imageURL);
            downloadImage(imageURL, new File(targetDir, fileName));
        }
    } catch (Exception e) {
        // println(e.getStackTrace()) would only print the array's identity string.
        e.printStackTrace();
    }
}

/**
 * Returns the file extension (including the leading dot) of the last path segment of
 * {@code imageURL}, ignoring any query string or fragment; returns "" when there is none.
 */
private static String extensionOf(String imageURL) {
    String path = imageURL;
    int cut = path.indexOf('?');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    cut = path.indexOf('#');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    int slash = path.lastIndexOf('/');
    int dot = path.lastIndexOf('.');
    // A dot at or before the last slash belongs to the host or a directory, not the file name.
    if (dot <= slash) {
        return "";
    }
    return path.substring(dot);
}

/**
 * Copies the body of {@code imageURL} into {@code target}, always releasing the streams
 * and the connection, even when the transfer fails part-way.
 */
private static void downloadImage(String imageURL, File target) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(imageURL).openConnection();
    BufferedInputStream in = null;
    OutputStream sink = null;
    try {
        connection.connect();
        in = new BufferedInputStream(connection.getInputStream());
        sink = new FileOutputStream(target);
        byte[] buf = new byte[1024];
        int size;
        while ((size = in.read(buf)) != -1) {
            sink.write(buf, 0, size);
        }
    } finally {
        // Nested finally blocks: a failing close() must not prevent the remaining cleanup.
        try {
            if (sink != null) {
                sink.close();
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                connection.disconnect();
            }
        }
    }
}
/**
 * Command-line entry point: downloads the images of a single page via
 * {@code extractLinks}.
 *
 * @param args optional; {@code args[0]}, when present, is the page URL to crawl.
 *             Without it the default target {@code http://localhost:8080/} is used.
 */
public static void main(String[] args) {
    String pageUrl = (args != null && args.length > 0) ? args[0] : "http://localhost:8080/";
    HtmlSourceGetter.extractLinks(pageUrl);
}
}
/*
* Created on Aug 26, 2011 2:41:26 PM
*
* HtmlSourceGetter.java
*
* NOTICE OF PROPRIETARY RIGHTS
*
* This program is a confidential trade secret and the property of author. Use, examination,
* reproduction, disassembly, decompiling, transfer and/or disclosure to others of
* all or any part of this software program are strictly prohibited except by express
* written agreement with author.
*
* --------------------------------------------------------------------------------------
* Modification History
* Date Author Version Description
* Aug 26, 2011 Cross 1.0 New
* --------------------------------------------------------------------------------------
*/
package com.cross.tools;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
public class HtmlSourceGetter {
// Class-wide (static) resource holders. Static reference fields default to null,
// so no explicit initializer is needed.
// NOTE(review): being static, these are shared by every caller of the class — confirm that is intended.
/** HTTP connection holder. */
private static HttpURLConnection con;
/** Buffered input stream holder. */
private static BufferedInputStream bis;
/** Output stream holder. */
private static OutputStream out;
public static void getSource(String url) {
public static void parseHTML(String url, String keyword) {
private static void processNodeList(NodeList list, String keyword) {
/**
 * Parses the page at {@code url}, finds every {@code <img>} tag and downloads each image
 * into the {@code c:/cross/} directory.
 *
 * <p>Each file is named {@code <index>_<currentTimeMillis><extension>}, where the extension
 * is taken from the last path segment of the image URL (empty when that segment has none).
 * The image URL and the tag text are echoed to standard output before each download.
 *
 * <p>Any failure (parse error, bad URL, I/O error) stops the crawl; the stack trace is
 * printed to standard error and the method returns normally.
 *
 * @param url address of the HTML page to scan for images
 */
public static void extractLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8");

        NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
        NodeList nodeList = parser.extractAllNodesThatMatch(imageFilter);

        // Make sure the target directory exists; otherwise every FileOutputStream would fail.
        File targetDir = new File("c:/cross/");
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            throw new IOException("cannot create directory " + targetDir);
        }

        for (int i = 0; i < nodeList.size(); i++) {
            ImageTag image = (ImageTag) nodeList.elementAt(i);
            String imageURL = image.getImageURL();
            System.out.println("imageURL:" + imageURL);
            System.out.println("imageText:" + image.getText());

            String fileName = i + "_" + System.currentTimeMillis() + extensionOf(imageURL);
            downloadImage(imageURL, new File(targetDir, fileName));
        }
    } catch (Exception e) {
        // println(e.getStackTrace()) would only print the array's identity string.
        e.printStackTrace();
    }
}

/**
 * Returns the file extension (including the leading dot) of the last path segment of
 * {@code imageURL}, ignoring any query string or fragment; returns "" when there is none.
 */
private static String extensionOf(String imageURL) {
    String path = imageURL;
    int cut = path.indexOf('?');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    cut = path.indexOf('#');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    int slash = path.lastIndexOf('/');
    int dot = path.lastIndexOf('.');
    // A dot at or before the last slash belongs to the host or a directory, not the file name.
    if (dot <= slash) {
        return "";
    }
    return path.substring(dot);
}

/**
 * Copies the body of {@code imageURL} into {@code target}, always releasing the streams
 * and the connection, even when the transfer fails part-way.
 */
private static void downloadImage(String imageURL, File target) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(imageURL).openConnection();
    BufferedInputStream in = null;
    OutputStream sink = null;
    try {
        connection.connect();
        in = new BufferedInputStream(connection.getInputStream());
        sink = new FileOutputStream(target);
        byte[] buf = new byte[1024];
        int size;
        while ((size = in.read(buf)) != -1) {
            sink.write(buf, 0, size);
        }
    } finally {
        // Nested finally blocks: a failing close() must not prevent the remaining cleanup.
        try {
            if (sink != null) {
                sink.close();
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                connection.disconnect();
            }
        }
    }
}
/**
 * Command-line entry point: downloads the images of a single page via
 * {@code extractLinks}.
 *
 * @param args optional; {@code args[0]}, when present, is the page URL to crawl.
 *             Without it the default target {@code http://localhost:8080/} is used.
 */
public static void main(String[] args) {
    String pageUrl = (args != null && args.length > 0) ? args[0] : "http://localhost:8080/";
    HtmlSourceGetter.extractLinks(pageUrl);
}
}