Java正則匹配之 淘寶資訊爬取
阿新 • • 發佈:2020-09-08
爬取頁面分析:
1)每件商品以 <div class="item J_MouserOnverReq 開頭,以 </div></div></div></div> 結尾
2) 商品中有用的資料為:trace-nid、trace-price(價格)、img 的 src(圖片地址)與 alt、deal-cnt(成交數)、location(所在地)、data-nick,共七項
一、工具類 DataCenter
package cn.kgc.regex.file; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class DataCenter { // 每一個商品中有一個list的資料 private List<List<String>> list = newArrayList<>(); // 檔案解析到String public StringBuilder combine(String file){ BufferedReader br = null; StringBuilder builder = new StringBuilder(); try { br = new BufferedReader(new FileReader(file)); String line = null; while (null!= (line=br.readLine())){ if((line=line.trim()).length()==0){ continue; } builder.append(line); } } catch (Exception e) { e.printStackTrace(); return null; } finally {if (null != br) { try { br.close(); } catch (IOException e) { e.printStackTrace(); } } } return builder; } // 頁面篩選之 商品篩選 public List<String> split(String regex, String src){ List<String> list = new ArrayList<>(); Matcher matcher = Pattern.compile(regex).matcher(src); while (matcher.find()){ list.add(matcher.group()); } return list; } // 商品解析 public void parse(String regex,String src){ List<String> list = new ArrayList<>(4); Matcher matcher = Pattern.compile(regex).matcher(src); if (matcher.find()) { list.add(matcher.group(1)); list.add(matcher.group(2)); list.add(matcher.group(3)); list.add(matcher.group(4)); list.add(matcher.group(5)); list.add(matcher.group(6)); list.add(matcher.group(7)); } this.list.add(list); } // 當作狀態值,為了判斷是否解析結束 public int getCount(){ return list.size(); } // 遍歷輸出展示 public void foreach(){ for (List<String> strs : list) { for (String str : strs) { System.out.print(str+"\t"); } System.out.println(); } } }
二、App類
package cn.kgc.regex; import cn.kgc.regex.file.DataCenter; import java.util.Iterator; import java.util.List; /** * Hello world! * */ public class App { public static void main( String[] args ) throws InterruptedException { DataCenter dc = new DataCenter(); // 1. 檔案流讀取到String中 String src = dc.combine("D:\\JAVA學習資料\\Java正則爬取資訊\\regexdemo\\src\\main\\java\\cn\\kgc\\regex\\file\\tb-shirt.html") .toString(); // 2.regex1: 每一件商品,都在這種結構中 final String regex = "<div class=\"item J_MouserOnverReq .*?</div></div></div></div>"; // regex2:每一件商品中各種型別的資料在(.*?)中 final String regex2 = ".*?trace-nid=\"(.*?)\" .*? trace-price=\"(.*?)\" .*?><img.*?src=\"(.*?)\".*?alt=\"(.*?)\".*?<div class=\"deal-cnt\">(.*?)</div>.*?<div class=\"location\">(.*?)</div>.*?data-nick=\"(.*?)\".*?"; // list的size = 商品數量 List<String> split = dc.split(regex, src); // for (String s : split) { // System.out.println(s); // } Iterator<String> it = split.iterator(); while (it.hasNext()) { final String line = it.next(); // 3. 遍歷商品,每一行(件)開一個執行緒匹配 new Thread(()->{ dc.parse(regex2,line); }).start(); } // 4. 狀態值判定執行緒結束 while (dc.getCount()<split.size()){ Thread.sleep(200); } // 5. 多執行緒讀取結束後遍歷取值 dc.foreach(); } }