1. 程式人生 > 實用技巧 >Java正則匹配之 淘寶資訊爬取

Java正則匹配之 淘寶資訊爬取

爬取頁面分析:

1) 每件商品以 `<div class="item J_MouserOnverReq` 開頭,以 `</div></div></div></div>` 結尾

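2) 商品中有用的資料為:`trace-nid`、`trace-price`、`<img>` 的 `src` 與 `alt`、`<div class="deal-cnt">` 的內容、`<div class="location">` 的內容,以及 `data-nick`(即下方 regex2 中七個 `(.*?)` 所擷取的值)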

一、工具類 DataCenter

package cn.kgc.regex.file;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helper that reads a text file, splits it into matching
 * fragments, and stores the capture groups extracted from each fragment.
 *
 * <p>Every call to {@link #parse(String, String)} appends exactly one row
 * (possibly empty) to an internal list. All access to that list is guarded by
 * a lock on the list itself, so {@code parse}, {@link #getCount()} and
 * {@link #foreach()} may be called from different threads.
 */
public class DataCenter {
    // One row of captured values per parsed fragment; guarded by synchronized (list).
    private final List<List<String>> list = new ArrayList<>();

    /**
     * Reads a text file and concatenates its lines into one buffer.
     *
     * <p>Each line is trimmed; lines that are empty after trimming are
     * skipped. No separator is inserted between lines. The file is decoded
     * with the platform default charset (behavior of {@link FileReader}).
     *
     * @param file path of the file to read
     * @return the concatenated content, or {@code null} if {@code file} is
     *         {@code null} or the file cannot be read
     */
    public StringBuilder combine(String file) {
        if (file == null) {
            return null;
        }
        StringBuilder builder = new StringBuilder();
        // try-with-resources closes the reader on every exit path.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty()) {
                    builder.append(line);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        return builder;
    }

    /**
     * Collects every non-overlapping match of {@code regex} in {@code src}.
     *
     * @param regex regular expression describing one fragment
     * @param src   text to search
     * @return the matched substrings in order of appearance; empty if none match
     */
    public List<String> split(String regex, String src) {
        List<String> matches = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(src);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /**
     * Extracts the capture groups of the first match of {@code regex} in
     * {@code src} and stores them as one row.
     *
     * <p>All capture groups defined by the pattern are stored, in group order.
     * If nothing matches, an empty row is stored so that {@link #getCount()}
     * still advances by one per call.
     *
     * @param regex regular expression with capture groups
     * @param src   text of a single fragment
     */
    public void parse(String regex, String src) {
        Matcher matcher = Pattern.compile(regex).matcher(src);
        List<String> row = new ArrayList<>();
        if (matcher.find()) {
            // Use the pattern's own group count instead of a fixed 1..7 so a
            // regex with fewer groups does not throw IndexOutOfBoundsException.
            int groups = matcher.groupCount();
            for (int g = 1; g <= groups; g++) {
                row.add(matcher.group(g));
            }
        }
        // Matching is done outside the lock; only the shared append is guarded.
        synchronized (list) {
            list.add(row);
        }
    }

    /**
     * Returns how many rows have been stored, i.e. how many {@code parse}
     * calls have completed. Callers can use it as a progress indicator.
     *
     * @return number of stored rows
     */
    public int getCount() {
        synchronized (list) {
            return list.size();
        }
    }

    /**
     * Prints every stored row to standard output: each value followed by a
     * tab, one row per line.
     */
    public void foreach() {
        // Copy under the lock, print outside it, so slow output never blocks parse().
        List<List<String>> snapshot;
        synchronized (list) {
            snapshot = new ArrayList<>(list);
        }
        for (List<String> row : snapshot) {
            for (String value : row) {
                System.out.print(value + "\t");
            }
            System.out.println();
        }
    }
}

二、App類

package cn.kgc.regex;

import cn.kgc.regex.file.DataCenter;

import java.util.Iterator;
import java.util.List;

/**
 * Demo entry point: loads a saved HTML page, cuts it into per-item fragments
 * with one regex, extracts the fields of every fragment with a second regex
 * (one worker thread per fragment), and prints the collected rows.
 */
public class App 
{
    // Page used when no path is given on the command line.
    private static final String DEFAULT_PAGE =
            "D:\\JAVA學習資料\\Java正則爬取資訊\\regexdemo\\src\\main\\java\\cn\\kgc\\regex\\file\\tb-shirt.html";

    /**
     * Runs the extraction.
     *
     * @param args optional; {@code args[0]} overrides the path of the HTML
     *             file to read, otherwise {@code DEFAULT_PAGE} is used
     * @throws InterruptedException if the main thread is interrupted while
     *                              waiting for the worker threads
     */
    public static void main( String[] args ) throws InterruptedException {
        DataCenter dc = new DataCenter();

        // 1. Read the whole file into a single string.
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PAGE;
        StringBuilder page = dc.combine(path);
        if (page == null) {
            // combine() returns null when the file could not be read.
            System.err.println("Unable to read page: " + path);
            return;
        }
        String src = page.toString();

        // 2. regex: the markup that wraps a single item.
        final String regex = "<div class=\"item J_MouserOnverReq .*?</div></div></div></div>";
        //    regex2: the individual fields of one item, captured by the (.*?) groups.
        final String regex2 = ".*?trace-nid=\"(.*?)\" .*? trace-price=\"(.*?)\" .*?><img.*?src=\"(.*?)\".*?alt=\"(.*?)\".*?<div class=\"deal-cnt\">(.*?)</div>.*?<div class=\"location\">(.*?)</div>.*?data-nick=\"(.*?)\".*?";

        // One list element per item found in the page.
        List<String> split = dc.split(regex, src);

        // 3. Parse every item on its own thread.
        // NOTE(review): this relies on DataCenter.parse being safe to call
        // concurrently — confirm against the DataCenter implementation.
        Thread[] workers = new Thread[split.size()];
        for (int i = 0; i < workers.length; i++) {
            final String line = split.get(i);
            workers[i] = new Thread(() -> dc.parse(regex2, line));
            workers[i].start();
        }

        // 4. Wait for the workers with join() rather than polling a counter:
        //    join() returns even if a worker died with an exception, so main
        //    cannot spin forever, and it makes the workers' writes visible here.
        for (Thread worker : workers) {
            worker.join();
        }

        // 5. All workers are done; print the collected rows.
        dc.foreach();
    }
}