爬取href超連結,正則指定目標結果
阿新 • • 發佈:2019-02-13
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small crawler: reads a locally cached HTML page, extracts every hyperlink that
 * points at {@code /plus/view.php...}, fetches each linked page and prints the
 * link text together with the first "主樓..." span text found on that page.
 *
 * <p>Typical use: call {@link #download(String)} once to cache the index page in
 * {@code content.txt}, then call {@link #test()} to crawl the links it contains.
 */
public class Test2 {

    /** Host prefix prepended to the site-relative links found in the cached page. */
    private static final String SITE_ROOT = "http://ste.xidian.edu.cn";

    /** Local file used as the cache of the index page. Always read/written as UTF-8. */
    private static final String CACHE_FILE = "content.txt";

    /**
     * Anchor tags whose href starts with {@code /plus/view.php}.
     * Group 1 is the href value, group 2 is the anchor's inner text.
     * {@code [^>]*?} keeps the attribute scan inside the opening tag; the pages are
     * joined into a single line, so a plain {@code .*?} could run on into later elements.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s[^>]*?href=\"(/plus/view\\.php[^\"]+)\"[^>]*>(.*?)</a>");

    /**
     * A span (with at least one attribute character) whose text starts with "主樓".
     * Group 1 is the span text. Example of matching markup:
     * {@code <span style="font-size:10.5000pt;">主樓二區137</span>}
     */
    private static final Pattern ROOM_PATTERN = Pattern.compile("<span[^>]+>(主樓[^<]*)</span>");

    /** Creates a crawler. The patterns are shared constants, so there is no per-instance state. */
    public Test2() {
    }

    /**
     * Downloads the page at the given URL and stores it, with line breaks removed,
     * in {@code content.txt} (UTF-8).
     *
     * @param string absolute URL of the page to cache
     * @throws IOException if the page cannot be fetched or the file cannot be written
     */
    public void download(String string) throws IOException {
        // Fetch first so that a failed request does not truncate an existing cache file.
        String html = fetch(string);
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(CACHE_FILE), StandardCharsets.UTF_8))) {
            writer.write(html);
        }
    }

    /**
     * Fetches the page at the given URL and, if it contains a span matching
     * {@link #ROOM_PATTERN}, prints {@code name + TAB + spanText} to standard output.
     * Nothing is printed when no such span exists.
     *
     * @param string absolute URL of the page to inspect
     * @param name   label printed in front of the extracted text
     * @throws IOException if the page cannot be fetched
     */
    public void visit(String string, String name) throws IOException {
        String room = findRoom(fetch(string));
        if (room != null) {
            System.out.println(name + "\t" + room);
        }
    }

    /**
     * Reads {@code content.txt}, extracts every {@code /plus/view.php} link from it and
     * calls {@link #visit(String, String)} for each one, in document order.
     *
     * @throws IOException if the cache file cannot be read or any linked page cannot be fetched
     */
    public void test() throws IOException {
        String html;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(CACHE_FILE), StandardCharsets.UTF_8))) {
            html = joinLines(reader);
        }
        for (String[] link : extractLinks(html)) {
            visit(SITE_ROOT + link[0], link[1]);
        }
    }

    /**
     * Returns the text of the first span matching {@link #ROOM_PATTERN}.
     *
     * @param html page content to search
     * @return the captured span text, or {@code null} when there is no match
     */
    static String findRoom(CharSequence html) {
        Matcher matcher = ROOM_PATTERN.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Collects all anchors matching {@link #LINK_PATTERN}.
     *
     * @param html page content to search
     * @return one two-element array per link, {@code [0]} = href, {@code [1]} = anchor text,
     *         in document order; empty when nothing matches
     */
    static List<String[]> extractLinks(CharSequence html) {
        List<String[]> links = new ArrayList<String[]>();
        Matcher matcher = LINK_PATTERN.matcher(html);
        while (matcher.find()) {
            links.add(new String[] {matcher.group(1), matcher.group(2)});
        }
        return links;
    }

    /** Fetches a URL as UTF-8 text with line breaks removed, releasing the connection afterwards. */
    private static String fetch(String address) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), StandardCharsets.UTF_8))) {
            return joinLines(reader);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Concatenates all lines of the reader without separators. Dropping the line breaks
     * is intentional: the patterns use {@code .} and must match markup that spans lines.
     */
    private static String joinLines(BufferedReader reader) throws IOException {
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
        }
        return text.toString();
    }

    /**
     * Entry point: crawls the links found in the cached {@code content.txt}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            new Test2().test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}