程式人生 > 爬取href超連結,正則指定目標結果

爬取href超連結,正則指定目標結果

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal regex-based crawler: saves a listing page to disk, extracts the
 * {@code /plus/view.php...} links from the saved copy, fetches every linked
 * page and prints the link text together with the first "main building" room
 * found on that page.
 *
 * <p>All text is read and written as UTF-8, independent of the platform
 * default charset. Pages are kept in memory with their line terminators
 * removed, so the regular expressions can match markup that was split across
 * several source lines.
 */
public class Test2 {
	/** File written by {@link #download(String)} and read by {@link #test()}. */
	private static final String DEFAULT_LISTING_FILE = "content.txt";

	/** Site root prepended to the root-relative links found in the listing. */
	private static final String DEFAULT_BASE_URL = "http://ste.xidian.edu.cn";

	/**
	 * Anchor whose href starts with {@code /plus/view.php}: group 1 is the
	 * href, group 2 the link text. The attribute scan uses {@code [^>]*?} so it
	 * cannot run past the end of the opening tag and pick up an href that
	 * belongs to a later, unrelated element.
	 */
	private static final Pattern LINK_PATTERN =
			Pattern.compile("<a\\s[^>]*?href=\"(/plus/view.php[^\"]+)\"[^>]*>(.*?)</a>");

	/**
	 * Span whose text starts with the Chinese word for "main building"
	 * (U+4E3B followed by U+6A13); group 1 is the whole span text, e.g. the
	 * room "main building, zone 2, 137" in
	 * {@code <span style="... font-size:10.5000pt; ...">...137</span>}.
	 * Spans with other text (such as the "advisor name" heading) do not match.
	 *
	 * <p>The simplified form of the second character (U+697C) is accepted as
	 * well. NOTE(review): the sample markup names a GB2312 font, so the live
	 * pages are presumably written in simplified Chinese -- confirm.
	 *
	 * <p>The characters are written as Unicode escapes so the pattern does not
	 * depend on the encoding the compiler assumes for this file.
	 */
	private static final Pattern ROOM_PATTERN =
			Pattern.compile("<span[^>]+>(\u4e3b[\u6a13\u697c][^<]*)</span>");

	/** Creates a crawler; it holds no per-instance state. */
	public Test2() {
	}

	/**
	 * Downloads a page into {@code content.txt} in the working directory.
	 *
	 * @param string URL of the page to download
	 * @throws IOException if the page cannot be read or the file cannot be written
	 */
	public void download(String string) throws IOException
	{
		download(string, DEFAULT_LISTING_FILE);
	}

	/**
	 * Downloads a page and stores it, line terminators removed, as UTF-8 text.
	 * The page is fetched completely before the target file is opened, so a
	 * failed download leaves an existing file untouched.
	 *
	 * @param string URL of the page to download
	 * @param dst    path of the file to (over)write
	 * @throws IOException if the page cannot be read or the file cannot be written
	 */
	public void download(String string, String dst) throws IOException
	{
		String content = fetch(string);
		try (BufferedWriter writer = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(dst), StandardCharsets.UTF_8))) {
			writer.write(content);
		}
	}

	/**
	 * Fetches a page and, if it contains a span matching the room pattern,
	 * prints {@code name}, a tab and the span text to standard output. Only the
	 * first matching span is reported; pages without one print nothing.
	 *
	 * @param string URL of the page to inspect
	 * @param name   label printed in front of the room (the link text)
	 * @throws IOException if the page cannot be read
	 */
	public void visit(String string,String name) throws IOException
	{
		Matcher res = ROOM_PATTERN.matcher(fetch(string));
		if (res.find()) {
			System.out.println(name+"\t"+res.group(1));
		}
	}

	/**
	 * Crawls the links stored in {@code content.txt}, resolving them against
	 * the default site root.
	 *
	 * @throws IOException if the listing file or any linked page cannot be read
	 */
	public void test() throws IOException
	{
		crawl(DEFAULT_LISTING_FILE, DEFAULT_BASE_URL);
	}

	/**
	 * Reads a saved listing page (UTF-8) and calls {@link #visit(String, String)}
	 * for every {@code /plus/view.php...} link in it, in document order.
	 *
	 * @param dst     path of the saved listing page
	 * @param baseUrl prefix put in front of each root-relative href, without a
	 *                trailing slash
	 * @throws IOException if the listing file or any linked page cannot be
	 *                     read; the first failure aborts the crawl
	 */
	public void crawl(String dst, String baseUrl) throws IOException
	{
		String content;
		try (InputStream in = new FileInputStream(dst)) {
			content = joinLines(in);
		}
		Matcher res = LINK_PATTERN.matcher(content);
		while (res.find()) {
			visit(baseUrl+res.group(1), res.group(2));
		}
	}

	/** Reads the whole body behind a URL; the stream is closed even when reading fails. */
	private static String fetch(String address) throws IOException
	{
		try (InputStream in = new URL(address).openStream()) {
			return joinLines(in);
		}
	}

	/**
	 * Decodes the stream as UTF-8 and concatenates its lines without
	 * separators. The caller owns the stream and is responsible for closing it.
	 */
	private static String joinLines(InputStream in) throws IOException
	{
		BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
		StringBuilder content = new StringBuilder();
		String line;
		while ((line = reader.readLine()) != null) {
			content.append(line);
		}
		return content.toString();
	}

	/**
	 * Runs the crawl over the previously downloaded {@code content.txt} and
	 * prints the stack trace if an I/O error stops it.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			new Test2().test();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}