Java呼叫百度搜索+Jsoup實現網路資源收集
阿新 • • 發佈:2019-02-08
Jsoup核心jar包:Jsoup核心jar包下載地址
Java程式碼:
抽象搜尋資源的實體:Webpage
package com.sinosoft.lhresource.search.common;
/**
 * Mutable value holder describing a single search hit.
 *
 * <p>It carries four independent string properties (title, link, summary and
 * body text). Each one is {@code null} until its setter has been called; no
 * validation is applied to the stored values.
 */
public class Webpage {

    /** Title of the hit. */
    private String title;

    /** Link (URL) of the hit. */
    private String url;

    /** Short summary of the hit. */
    private String summary;

    /** Body text of the hit. */
    private String content;

    /** @return the stored title, or {@code null} if none was set */
    public String getTitle() {
        return this.title;
    }

    /** @param value the title to store */
    public void setTitle(String value) {
        this.title = value;
    }

    /** @return the stored link, or {@code null} if none was set */
    public String getUrl() {
        return this.url;
    }

    /** @param value the link to store */
    public void setUrl(String value) {
        this.url = value;
    }

    /** @return the stored summary, or {@code null} if none was set */
    public String getSummary() {
        return this.summary;
    }

    /** @param value the summary to store */
    public void setSummary(String value) {
        this.summary = value;
    }

    /** @return the stored body text, or {@code null} if none was set */
    public String getContent() {
        return this.content;
    }

    /** @param value the body text to store */
    public void setContent(String value) {
        this.content = value;
    }
}
通過資源連結獲取資源內容:TextExtract.java;Tools.java
package com.sinosoft.lhresource.search.common;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Extracts body text from an HTML page by looking at how much text each run of
 * consecutive lines contains once the markup has been stripped.
 *
 * <p>The page is reduced to plain text, split into lines, and a character count
 * is computed for every window of {@code BLOCKS_WIDTH} consecutive lines. A
 * text block starts at a window whose count exceeds the threshold and ends at
 * the next window that is (or is followed by) an empty one.
 *
 * <p>Apart from the shared threshold, every call works on its own local state,
 * so one call cannot disturb the intermediate data of another.
 */
public class TextExtract {
    private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class);

    /** Number of consecutive lines summed into one window. */
    private static final int BLOCKS_WIDTH = 3;

    /** Lines shorter than this (after whitespace removal) are left out of a block. */
    private static final int MIN_LINE_LENGTH = 5;

    /*
     * Original author's note: when blocks of news headlines are not being
     * removed from the extracted text, increase this threshold. A larger value
     * raises precision and lowers recall; a smaller value lets in more noise
     * but still allows a body consisting of a single sentence to be extracted.
     */
    private static int threshold = 86;

    /**
     * Replaces the window character-count threshold used by all later calls.
     *
     * @param value new threshold; a block starts only at a window whose
     *              character count is strictly greater than this value
     */
    public static void setthreshold(int value) {
        threshold = value;
    }

    /**
     * Extracts the body text of a page without any topic-page check, i.e. the
     * caller already knows the page has body text worth extracting.
     *
     * @param _html the page HTML
     *
     * @return the extracted body text; empty when nothing qualifies or
     *         {@code _html} is {@code null}
     */
    public static String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the body text of a page.
     *
     * @param _html the page HTML
     * @param _flag intended to request a topic-page check; the implementation
     *              in this class never reads it, so it has no effect
     *
     * @return the extracted body text; empty when nothing qualifies or
     *         {@code _html} is {@code null}
     */
    public static String parse(String _html, boolean _flag) {
        if (_html == null) {
            return "";
        }
        String cleaned = preProcess(_html);
        LOG.debug(cleaned);
        return getText(cleaned);
    }

    /** Strips doctype, comments, scripts, styles, entities and tags, in that order. */
    private static String preProcess(String html) {
        String result = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
        result = result.replaceAll("(?is)<!--.*?-->", ""); // remove html comment
        result = result.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript
        result = result.replaceAll("(?is)<style.*?>.*?</style>", ""); // remove css
        result = result.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char
        result = result.replaceAll("(?is)<.*?>", "");
        return result;
    }

    /** Returns the window count at {@code index}, or 0 when the index is past the end. */
    private static int densityAt(List<Integer> density, int index) {
        return index < density.size() ? density.get(index) : 0;
    }

    /** Finds the text blocks in the tag-free text and joins them. */
    private static String getText(String cleaned) {
        List<String> lines = Arrays.asList(cleaned.split("\n"));
        // Whitespace is removed once per line up front; only whitespace-free
        // lines are ever counted or emitted below.
        for (int i = 0; i < lines.size(); i++) {
            lines.set(i, lines.get(i).replaceAll("\\s+", ""));
        }

        List<Integer> density = new ArrayList<>();
        for (int i = 0; i < lines.size() - BLOCKS_WIDTH; i++) {
            int wordsNum = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                wordsNum += lines.get(j).length();
            }
            density.add(wordsNum);
            LOG.debug("{}", wordsNum);
        }

        StringBuilder text = new StringBuilder();
        int start = -1;
        boolean inBlock = false;
        for (int i = 0; i < density.size() - 1; i++) {
            if (!inBlock) {
                // A block opens on a dense window that is followed by at least
                // one non-empty window. Windows past the end of the page count
                // as empty instead of raising IndexOutOfBoundsException.
                if (density.get(i) > threshold
                        && (densityAt(density, i + 1) != 0
                                || densityAt(density, i + 2) != 0
                                || densityAt(density, i + 3) != 0)) {
                    inBlock = true;
                    start = i;
                }
                continue;
            }

            // The block stays open until this window or the next one is empty.
            if (density.get(i) != 0 && density.get(i + 1) != 0) {
                continue;
            }
            int end = i;
            LOG.debug("{}\t\t{}", start + 1, end + 1);

            StringBuilder block = new StringBuilder();
            for (int k = start; k <= end; k++) {
                String line = lines.get(k);
                if (line.length() >= MIN_LINE_LENGTH) {
                    block.append(line).append("\n");
                }
            }
            String str = block.toString();
            LOG.debug(str);
            if (str.contains("Copyright") || str.contains("版權所有")) {
                // Extraction ends at the first block carrying a copyright
                // notice. The previous code kept the block open and rebuilt it
                // on every remaining iteration without ever appending it, which
                // produced this same result at quadratic cost.
                break;
            }
            text.append(str);
            inBlock = false;
        }
        return text.toString();
    }
}
package com.sinosoft.lhresource.search.common;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Static helpers for downloading page text and for copying/reading streams.
 */
public class Tools {
    private static final Logger LOG = LoggerFactory.getLogger(Tools.class);

    /**
     * Fetches a page, decoding it as UTF-8, and returns its extracted body text.
     *
     * @param url address of the page
     * @return the text produced by {@link TextExtract#parse(String)}, or
     *         {@code null} when the page could not be fetched or processed
     */
    public static String getHTMLContent(String url) {
        return getHTMLContent(url, "utf-8");
    }

    /**
     * Fetches a page with the given character encoding and returns its
     * extracted body text.
     *
     * @param url      address of the page
     * @param encoding charset name used to decode the response
     * @return the text produced by {@link TextExtract#parse(String)}, or
     *         {@code null} when anything fails (the failure is logged at debug
     *         level)
     */
    public static String getHTMLContent(String url, String encoding) {
        // The stream is declared as its own resource so it is closed even when
        // the reader cannot be created (e.g. unsupported encoding name).
        try (InputStream in = new URL(url).openStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
            return TextExtract.parse(html.toString());
        } catch (Exception e) {
            LOG.debug("解析URL失敗:" + url, e);
        }
        return null;
    }

    /**
     * Reads {@code in} completely and then writes the bytes to {@code outFile}.
     * The input stream is always closed before returning. I/O failures are
     * logged and not propagated.
     *
     * @param in      source stream; closed by this method
     * @param outFile destination file, created or overwritten
     */
    public static void copyFile(InputStream in, File outFile) {
        try {
            // The input is consumed in full before the output file is opened.
            byte[] data = readAll(in);
            try (OutputStream out = new FileOutputStream(outFile)) {
                out.write(data, 0, data.length);
            }
        } catch (IOException ex) {
            LOG.error("檔案操作失敗", ex);
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (IOException ex) {
                LOG.error("檔案操作失敗", ex);
            }
        }
    }

    /**
     * Reads a stream until end of stream and returns the bytes. The stream is
     * not closed.
     *
     * <p>If reading fails, the error is logged and the bytes read so far are
     * returned, so the result may be incomplete.
     *
     * @param in stream to read
     * @return the bytes read
     */
    public static byte[] readAll(InputStream in) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[1024];
            int n;
            // -1 is the end-of-stream marker defined by InputStream.read.
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
        } catch (IOException e) {
            LOG.error("讀取失敗", e);
        }
        return out.toByteArray();
    }
}
自定義檢索介面:Searcher.java
package com.sinosoft.lhresource.search.common;
import java.util.List;
/**
 * A keyword search that yields {@link Webpage} results.
 */
public interface Searcher {

    /**
     * Searches for a keyword without naming a result page.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    List<Webpage> search(String keyword);

    /**
     * Searches for a keyword and asks for one particular result page.
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return the matching pages
     */
    List<Webpage> search(String keyword, int page);
}
自定義處理百度檢索介面:BaiduSearcher.java
package com.sinosoft.lhresource.search.common;
import java.util.List;
/**
 * A {@link Searcher} extended with Baidu's News, Tieba, Zhidao and Wenku
 * searches, each with and without an explicit result page.
 */
public interface BaiduSearcher extends Searcher {

    /**
     * News search.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    List<Webpage> searchNews(String keyword);

    /**
     * News search (paged).
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return the matching pages
     */
    List<Webpage> searchNews(String keyword, int page);

    /**
     * Tieba search.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    List<Webpage> searchTieba(String keyword);

    /**
     * Tieba search (paged).
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return the matching pages
     */
    List<Webpage> searchTieba(String keyword, int page);

    /**
     * Zhidao search.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    List<Webpage> searchZhidao(String keyword);

    /**
     * Zhidao search (paged).
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return the matching pages
     */
    List<Webpage> searchZhidao(String keyword, int page);

    /**
     * Wenku search.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    List<Webpage> searchWenku(String keyword);

    /**
     * Wenku search (paged).
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return the matching pages
     */
    List<Webpage> searchWenku(String keyword, int page);
}
package com.sinosoft.lhresource.search.common;
import java.util.List;
/**
 * Base class for {@link BaiduSearcher} implementations.
 *
 * <p>Every single-argument vertical search delegates to its paged counterpart
 * with page 1. Every paged vertical search throws
 * {@link UnsupportedOperationException} until a subclass overrides it. The two
 * {@code search} methods inherited from {@code Searcher} are left abstract.
 */
public abstract class AbstractBaiduSearcher implements BaiduSearcher {

    /** Builds the exception thrown by paged searches that are not overridden. */
    private static UnsupportedOperationException notImplemented() {
        // UnsupportedOperationException is a RuntimeException, so callers that
        // catch RuntimeException keep working.
        return new UnsupportedOperationException("未實現");
    }

    /**
     * News search; delegates to {@link #searchNews(String, int)} with page 1.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    @Override
    public List<Webpage> searchNews(String keyword) {
        return searchNews(keyword, 1);
    }

    /**
     * News search (paged). Not implemented in this class.
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return never returns normally here
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchNews(String keyword, int page) {
        throw notImplemented();
    }

    /**
     * Tieba search; delegates to {@link #searchTieba(String, int)} with page 1.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    @Override
    public List<Webpage> searchTieba(String keyword) {
        return searchTieba(keyword, 1);
    }

    /**
     * Tieba search (paged). Not implemented in this class.
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return never returns normally here
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchTieba(String keyword, int page) {
        throw notImplemented();
    }

    /**
     * Zhidao search; delegates to {@link #searchZhidao(String, int)} with page 1.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    @Override
    public List<Webpage> searchZhidao(String keyword) {
        return searchZhidao(keyword, 1);
    }

    /**
     * Zhidao search (paged). Not implemented in this class.
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return never returns normally here
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchZhidao(String keyword, int page) {
        throw notImplemented();
    }

    /**
     * Wenku search; delegates to {@link #searchWenku(String, int)} with page 1.
     *
     * @param keyword the search term
     * @return the matching pages
     */
    @Override
    public List<Webpage> searchWenku(String keyword) {
        return searchWenku(keyword, 1);
    }

    /**
     * Wenku search (paged). Not implemented in this class.
     *
     * @param keyword the search term
     * @param page    the result page to fetch
     * @return never returns normally here
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchWenku(String keyword, int page) {
        throw notImplemented();
    }
}
百度搜索+Jsoup實現資源收集:JSoupBaiduSearcher.java
package com.sinosoft.lhresource.search.common;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link Searcher} implementation that loads a Baidu result page with Jsoup and
 * reads title, link and summary of each hit through CSS selectors.
 */
public class JSoupBaiduSearcher extends AbstractBaiduSearcher {
    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /** Number of hits on one Baidu result page. */
    private static final int PAGE_SIZE = 10;

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    /**
     * Searches for a keyword and returns the first result page.
     *
     * @param keyword the search term; must not be {@code null}
     * @return see {@link #search(String, int)}
     */
    @Override
    public List<Webpage> search(String keyword) {
        return search(keyword, 1);
    }

    /**
     * Searches for a keyword and returns one result page.
     *
     * <p>Baidu's {@code pn} parameter is the index of the first hit rather than
     * a page number: page 1 is {@code pn=0}, page 2 is {@code pn=10}, and so
     * on, i.e. {@code (page - 1) * PAGE_SIZE}.
     *
     * @param keyword the search term; must not be {@code null}
     * @param page    1-based result page
     * @return the hits whose title and summary could both be read;
     *         {@code null} when the reported result count is below 1; an empty
     *         or partial list when an I/O error occurred (the error is logged)
     * @throws IllegalArgumentException if {@code keyword} is {@code null}
     */
    @Override
    public List<Webpage> search(String keyword, int page) {
        if (keyword == null) {
            throw new IllegalArgumentException("keyword must not be null");
        }
        int offset = (page - 1) * PAGE_SIZE;
        List<Webpage> webpages = new ArrayList<>();
        try {
            // The keyword is percent-encoded so that characters such as '&',
            // '#', '+' or spaces cannot alter or truncate the query string.
            String url = "http://www.baidu.com/s?pn=" + offset
                    + "&wd=" + URLEncoder.encode(keyword, "UTF-8");
            Document document = Jsoup.connect(url).get();
            int total = getBaiduSearchResultCount(document);
            if (total < 1) {
                return null;
            }
            // Fewer hits than one full page: only look for that many.
            int len = Math.min(total, PAGE_SIZE);
            for (int i = 0; i < len; i++) {
                Webpage webpage = extractResult(document, offset + i + 1);
                if (webpage != null) {
                    webpages.add(webpage);
                }
            }
        } catch (IOException ex) {
            LOG.error("搜尋出錯", ex);
        }
        return webpages;
    }

    /**
     * Reads the hit with the given element id from the result page.
     *
     * @param document the parsed result page
     * @param resultId id of the hit's container element (1-based across pages)
     * @return the hit, or {@code null} when its title or summary is missing
     */
    private Webpage extractResult(Document document, int resultId) {
        String titleCssQuery = "html body div div div div#content_left div#" + resultId
                + ".result.c-container h3.t a";
        String summaryCssQuery = "html body div div div div#content_left div#" + resultId
                + ".result.c-container div.c-abstract";
        LOG.debug("titleCssQuery:" + titleCssQuery);
        LOG.debug("summaryCssQuery:" + summaryCssQuery);

        Element titleElement = document.select(titleCssQuery).first();
        if (titleElement == null) {
            // Fallback for a Baidu Baike entry, which uses a different layout.
            titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
            summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
            LOG.debug("處理百度百科 titleCssQuery:" + titleCssQuery);
            LOG.debug("處理百度百科 summaryCssQuery:" + summaryCssQuery);
            titleElement = document.select(titleCssQuery).first();
        }
        String titleText = "";
        String href = "";
        if (titleElement != null) {
            titleText = titleElement.text();
            href = titleElement.attr("href");
        }
        LOG.debug(titleText);

        Element summaryElement = document.select(summaryCssQuery).first();
        if (summaryElement == null) {
            // Fallback for a Baidu Zhidao entry, whose summary sits in a <font>.
            summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
            LOG.debug("處理百度知道 summaryCssQuery:" + summaryCssQuery);
            summaryElement = document.select(summaryCssQuery).first();
        }
        String summaryText = summaryElement != null ? summaryElement.text() : "";
        LOG.debug(summaryText);

        if (titleText.trim().isEmpty() || summaryText.trim().isEmpty()) {
            LOG.error("獲取搜尋結果列表項出錯:" + titleText + " - " + summaryText);
            return null;
        }
        Webpage webpage = new Webpage();
        webpage.setTitle(titleText);
        webpage.setUrl(href);
        webpage.setSummary(summaryText);
        // The content field is left unset; a body fetch through
        // Tools.getHTMLContent(href) used to be commented out at this point.
        return webpage;
    }

    /**
     * Reads the result count from the page by taking text such as
     * "百度為您找到相關結果約13,200個" and keeping only its digits.
     *
     * @param document the parsed result page
     * @return the result count; 0 when the count element is missing or holds no
     *         digits; {@link Integer#MAX_VALUE} when the number does not fit in
     *         an {@code int}
     */
    private int getBaiduSearchResultCount(Document document) {
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: " + cssQuery);
        Element totalElement = document.select(cssQuery).first();
        if (totalElement == null) {
            LOG.warn("找不到搜尋結果數元素,cssQuery: " + cssQuery);
            return 0;
        }
        String totalText = totalElement.text();
        LOG.info("搜尋結果文字:" + totalText);
        String digits = NON_DIGITS.matcher(totalText).replaceAll("");
        if (digits.isEmpty()) {
            return 0;
        }
        int total;
        try {
            total = Integer.parseInt(digits);
        } catch (NumberFormatException ignored) {
            // Only ASCII digits remain, so the sole way to fail is overflow.
            total = Integer.MAX_VALUE;
        }
        LOG.info("搜尋結果數:" + total);
        return total;
    }

    /**
     * Demo entry point: runs one search and logs every hit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int page = 2;
        Searcher searcher = new JSoupBaiduSearcher();
        List<Webpage> webpages = searcher.search("六扇門", page);
        if (webpages != null) {
            int i = 1;
            LOG.info("搜尋結果 當前第 " + page + " 頁,頁面大小為:" + webpages.size() + " 共有結果數:" + webpages.size());
            for (Webpage webpage : webpages) {
                LOG.info("搜尋結果 " + (i++) + " :");
                LOG.info("標題:" + webpage.getTitle());
                LOG.info("URL:" + webpage.getUrl());
                LOG.info("摘要:" + webpage.getSummary());
                LOG.info("正文:" + webpage.getContent());
                LOG.info("");
            }
        } else {
            LOG.error("沒有搜尋到結果");
        }
    }
}