爬取京東收件地址下的所有資料
阿新 • • 發佈:2018-11-02
1.工具備用
能直接爬出京東的全國地址並拷貝到本地資料庫中,使用的話注意資料庫連線和表結構.

package reptile;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

/**
 * Crawler for the JD.com delivery-address hierarchy.
 *
 * <p>Provinces come from a literal embedded in {@link #main}, cities from a local text file,
 * and districts (optionally towns) from the JD area endpoint. The collected rows are then
 * written to MySQL tables {@code tb_province}, {@code tb_city}, {@code tb_district} and
 * {@code tb_town}. Adjust the JDBC settings in {@link #getConnection()} and make sure the
 * tables exist before running.
 *
 * @author daiyang
 */
public class Reptile4 {

    /** Running counter printed while town rows are inserted. */
    public static int i = 0;
    public static int j = 0;

    /** Area endpoint; the parent area id is appended as the {@code fid} query value. */
    private static final String AREA_ENDPOINT = "https://d.jd.com/area/get?fid=";
    /** Marker inserted between the lines of the city file so they survive being joined into one string. */
    private static final String LINE_SEPARATOR = "hello,dy";
    /** City source file used when no path is passed on the command line. */
    private static final String DEFAULT_CITY_FILE = "D:\\test\\city.txt";
    /** Number of worker threads used for the district crawl. */
    private static final int WORKER_THREADS = 50;
    /** Without timeouts a single hung socket would keep the pool (and main) alive forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 30000;
    /**
     * When true, every crawled district is followed by a request for its towns.
     * Off by default, which matches the original code where that call was commented out.
     */
    private static final boolean CRAWL_TOWNS = false;

    /**
     * Entry point: builds the province and city lists, crawls the districts of every city
     * in parallel, then stores the result.
     *
     * @param args optional; {@code args[0]} overrides the path of the city source file
     * @throws Exception if the wait for the worker pool is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Province records are "name|id|code[|type]" separated by commas; type 1 marks a municipality.
        // The escapes in this literal are already translated by the compiler, so cover() passes it through.
        String jdProvince = cover("\u5317\u4eac|1|72|1,\u4e0a\u6d77|2|78|1,\u5929\u6d25|3|51035|1,"
                + "\u91cd\u5e86|4|113|1,\u6cb3\u5317|5|142,\u5c71\u897f|6|303,\u6cb3\u5357|7|412,"
                + "\u8fbd\u5b81|8|560,\u5409\u6797|9|639,\u9ed1\u9f99\u6c5f|10|698,\u5185\u8499\u53e4|11|799,"
                + "\u6c5f\u82cf|12|904,\u5c71\u4e1c|13|1000,\u5b89\u5fbd|14|1116,\u6d59\u6c5f|15|1158,\u798f\u5efa|16|1303,"
                + "\u6e56\u5317|17|1381,\u6e56\u5357|18|1482,\u5e7f\u4e1c|19|1601,\u5e7f\u897f|20|1715,\u6c5f\u897f|21|1827,"
                + "\u56db\u5ddd|22|1930,\u6d77\u5357|23|2121,\u8d35\u5dde|24|2144,\u4e91\u5357|25|2235,\u897f\u85cf|26|2951,"
                + "\u9655\u897f|27|2376,\u7518\u8083|28|2487,\u9752\u6d77|29|2580,\u5b81\u590f|30|2628,\u65b0\u7586|31|2652,"
                + "\u6e2f\u6fb3|52993|52994,\u53f0\u6e7e|32|2768,\u9493\u9c7c\u5c9b|84|84");

        // Read the raw city data and decode its escape sequences.
        String cityFile = args.length > 0 ? args[0] : DEFAULT_CITY_FILE;
        String unicodeCity = readFile(cityFile);
        String jdCity = cover(unicodeCity);

        List<Map<String, Object>> provinceList = provinceDataHandle(jdProvince);
        List<Map<String, Object>> cityList = cityDataHandle(jdCity);
        // Concurrent queues because the crawl tasks append to them from several threads.
        ConcurrentLinkedQueue<Map<String, Object>> districtList = new ConcurrentLinkedQueue<Map<String, Object>>();
        ConcurrentLinkedQueue<Map<String, Object>> courtList = new ConcurrentLinkedQueue<Map<String, Object>>();

        dataHandle(provinceList, cityList, districtList);
        System.out.println(JSON.toJSON(provinceList));
        System.out.println(JSON.toJSON(cityList));
        System.out.println(JSON.toJSON(districtList));

        // Each task claims the next city index from this counter.
        AtomicInteger atoI = new AtomicInteger(0);
        ExecutorService es = Executors.newFixedThreadPool(WORKER_THREADS);
        System.out.println("===========>>>>>>>>>>>>>>>>>>開始搜尋資料");
        Runnable task = () -> getDistrictInfo(courtList, districtList, cityList, atoI);
        int taskCount = cityList.size();
        for (int submitted = 0; submitted < taskCount; submitted++) {
            es.submit(task);
        }
        es.shutdown();
        // Block until every crawl task has finished.
        while (!es.awaitTermination(1, TimeUnit.SECONDS)) {
            // keep waiting
        }

        System.out.println("---END---\n");
        System.out.println("所有的子執行緒都結束了!");
        System.out.println("=================>>>>>>>>>>>>>>>開始存入資料庫");
        // Enable the inserts that are needed for the current run:
        // addProvinceData(provinceList);
        // addCityData(cityList);
        // addDistructData(districtList);
        // NOTE(review): courtList is only filled when CRAWL_TOWNS is true; with the default
        // setting this call inserts nothing.
        addTownData(courtList);
    }

    static Connection conn;
    static PreparedStatement ps;
    static ResultSet rs;

    /**
     * Opens a JDBC connection using the settings hard-coded below and remembers it in
     * {@link #conn}.
     *
     * @return the new connection, or {@code null} when it could not be opened
     */
    public static Connection getConnection() {
        String url = "jdbc:mysql://localhost:port/database";
        String userName = "username";
        String password = "password";
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            System.out.println("找不到驅動!");
            e.printStackTrace();
        }
        Connection opened = null;
        try {
            opened = DriverManager.getConnection(url, userName, password);
            if (opened != null) {
                System.out.println("connection successful");
            }
        } catch (SQLException e) {
            System.out.println("connection fail");
            e.printStackTrace();
        }
        // Never hand back a stale connection left over from an earlier call.
        conn = opened;
        return opened;
    }

    /** Binds the values of one row map to the placeholders of an insert statement. */
    @FunctionalInterface
    private interface RowBinder {
        void bind(PreparedStatement statement, Map<String, Object> row) throws SQLException;
    }

    /**
     * Runs {@code sql} once per row on a fresh connection and always closes the statement
     * and the connection, including when they were only partially set up.
     *
     * @return the update count of the last insert that was executed, 0 if none was
     */
    private static int insertRows(String sql, Iterable<Map<String, Object>> rows, RowBinder binder) {
        int row = 0;
        Connection opened = getConnection();
        if (opened == null) {
            return row;
        }
        try (Connection connection = opened;
                PreparedStatement statement = connection.prepareStatement(sql)) {
            for (Map<String, Object> map : rows) {
                binder.bind(statement, map);
                row = statement.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return row;
    }

    /**
     * Inserts town rows (keys {@code name}, {@code districtId}, {@code id}) into {@code tb_town}.
     *
     * @param list town rows to store
     * @return the update count of the last executed insert, 0 if nothing was inserted
     */
    public static int addTownData(ConcurrentLinkedQueue<Map<String, Object>> list) {
        String sql = "insert into tb_town(name,districtId,jdTownId) values(?,?,?)";
        return insertRows(sql, list, (statement, map) -> {
            System.out.println("FBIWARNING i....:" + (i++));
            statement.setString(1, (String) map.get("name"));
            statement.setInt(2, Integer.parseInt((String) map.get("districtId")));
            statement.setInt(3, Integer.parseInt((String) map.get("id")));
        });
    }

    /**
     * Inserts district rows (keys {@code name}, {@code cityId}, {@code id}) into {@code tb_district}.
     *
     * @param list district rows to store
     * @return the update count of the last executed insert, 0 if nothing was inserted
     */
    public static int addDistructData(ConcurrentLinkedQueue<Map<String, Object>> list) {
        String sql = "insert into tb_district(name,cityId,jdDistrictId) values(?,?,?)";
        return insertRows(sql, list, (statement, map) -> {
            statement.setString(1, (String) map.get("name"));
            statement.setInt(2, Integer.parseInt((String) map.get("cityId")));
            statement.setInt(3, Integer.parseInt((String) map.get("id")));
        });
    }

    /**
     * Inserts city rows (keys {@code cityName}, {@code provinceId}, {@code cityId}) into {@code tb_city}.
     *
     * @param list city rows to store
     * @return the update count of the last executed insert, 0 if nothing was inserted
     */
    public static int addCityData(List<Map<String, Object>> list) {
        String sql = "insert into tb_city(name,provinceId,jdCityId) values(?,?,?)";
        return insertRows(sql, list, (statement, map) -> {
            statement.setString(1, (String) map.get("cityName"));
            statement.setInt(2, Integer.parseInt((String) map.get("provinceId")));
            statement.setInt(3, Integer.parseInt((String) map.get("cityId")));
        });
    }

    /**
     * Inserts province rows (keys {@code provinceName}, {@code provinceCode},
     * {@code provinceType}, {@code provinceId}) into {@code tb_province}.
     *
     * @param list province rows to store
     * @return the update count of the last executed insert, 0 if nothing was inserted
     */
    public static int addProvinceData(List<Map<String, Object>> list) {
        String sql = "insert into tb_province(name,provinceCode,provinceType,jdProvinceId) values(?,?,?,?)";
        return insertRows(sql, list, (statement, map) -> {
            statement.setString(1, (String) map.get("provinceName"));
            statement.setString(2, (String) map.get("provinceCode"));
            statement.setInt(3, Integer.parseInt((String) map.get("provinceType")));
            statement.setInt(4, Integer.parseInt((String) map.get("provinceId")));
        });
    }

    /**
     * Fixes up the city list for municipalities: in the source data the districts of a
     * municipality are listed at city level, so those entries are removed and the
     * municipality itself is appended as its own single city.
     *
     * <p>Municipalities are the provinces whose {@code provinceType} is {@code "1"}.
     * {@code districtList} is not used here.
     */
    private static void dataHandle(List<Map<String, Object>> provinceList, List<Map<String, Object>> cityList,
            ConcurrentLinkedQueue<Map<String, Object>> districtList) {
        List<Map<String, Object>> municipalities = new ArrayList<Map<String, Object>>();
        Set<Integer> municipalityIds = new HashSet<Integer>();
        for (Map<String, Object> province : provinceList) {
            if ("1".equals(province.get("provinceType"))) {
                municipalities.add(province);
                municipalityIds.add(Integer.valueOf((String) province.get("provinceId")));
            }
        }

        cityList.removeIf(city -> municipalityIds.contains(Integer.valueOf((String) city.get("provinceId"))));

        for (Map<String, Object> municipality : municipalities) {
            Map<String, Object> asCity = new HashMap<String, Object>();
            asCity.put("cityName", municipality.get("provinceName"));
            asCity.put("cityId", municipality.get("provinceId"));
            asCity.put("provinceId", municipality.get("provinceId"));
            cityList.add(asCity);
        }
    }

    /**
     * Requests the child areas of {@code parentId} and appends one map per child to
     * {@code sink}, with keys {@code name}, {@code id} and {@code parentKey} (holding
     * {@code parentId}). Failures are reported on the console and end the processing of
     * this parent.
     *
     * @return the maps that were appended by this call
     */
    private static List<Map<String, Object>> fetchChildAreas(String parentId, String parentKey,
            ConcurrentLinkedQueue<Map<String, Object>> sink) {
        List<Map<String, Object>> added = new ArrayList<Map<String, Object>>();
        String url = AREA_ENDPOINT + parentId;
        try {
            JSONArray areas = JSONArray.parseArray(request(url));
            for (Object element : areas) {
                JSONObject area = (JSONObject) element;
                Object name = area.get("name");
                String id = area.getString("id");
                if (name == null || id == null) {
                    // An incomplete record should not discard the remaining children.
                    continue;
                }
                Map<String, Object> one = new HashMap<String, Object>();
                one.put("name", name.toString());
                one.put("id", id);
                one.put(parentKey, parentId);
                sink.add(one);
                added.add(one);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("請求地址錯誤");
        }
        return added;
    }

    /** Crawls the fourth level (towns) of one district into {@code courtList}. */
    private static void getCourtInfo(ConcurrentLinkedQueue<Map<String, Object>> courtList, String districtId) {
        fetchChildAreas(districtId, "districtId", courtList);
    }

    /**
     * Crawls the third level (districts) of the next unprocessed city into
     * {@code districtList}; the city is chosen by incrementing {@code atoI}.
     * Towns are crawled as well when {@link #CRAWL_TOWNS} is enabled.
     */
    private static void getDistrictInfo(ConcurrentLinkedQueue<Map<String, Object>> courtList,
            ConcurrentLinkedQueue<Map<String, Object>> districtList, List<Map<String, Object>> cityList,
            AtomicInteger atoI) {
        Map<String, Object> map = cityList.get(atoI.getAndIncrement());
        List<Map<String, Object>> districts = fetchChildAreas((String) map.get("cityId"), "cityId", districtList);
        if (CRAWL_TOWNS) {
            for (Map<String, Object> district : districts) {
                getCourtInfo(courtList, (String) district.get("id"));
            }
        }
    }

    /**
     * Performs a GET request and returns the response body with line terminators removed.
     *
     * @throws Exception if the URL is malformed or the request fails
     */
    private static String request(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        connection.connect();
        StringBuilder content = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), responseCharset(connection)))) {
            String line;
            while ((line = in.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString();
    }

    /**
     * Picks the charset announced in the Content-Type header; falls back to the platform
     * default (what the reader used before) when none is declared or it is not supported.
     */
    private static Charset responseCharset(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType != null) {
            String prefix = "charset=";
            for (String part : contentType.split(";")) {
                String parameter = part.trim();
                if (parameter.regionMatches(true, 0, prefix, 0, prefix.length())) {
                    String name = parameter.substring(prefix.length()).replace("\"", "").trim();
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Unknown or malformed charset name: use the fallback below.
                    }
                }
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Parses the decoded city data. Lines are separated by {@link #LINE_SEPARATOR}; each
     * line is {@code provinceId:name|id,name|id,...} and may contain double quotes, which
     * are dropped. Blank or malformed lines and entries are skipped.
     *
     * @return one map per city with keys {@code cityName}, {@code cityId}, {@code provinceId}
     */
    private static List<Map<String, Object>> cityDataHandle(String cityStr) {
        List<Map<String, Object>> cityList = new ArrayList<Map<String, Object>>();
        for (String line : cityStr.split(LINE_SEPARATOR)) {
            String[] parts = line.replace("\"", "").split(":");
            if (parts.length < 2) {
                continue;
            }
            String provinceId = parts[0].replace("\t", "");
            for (String entry : parts[1].split(",")) {
                if (entry.trim().isEmpty()) {
                    continue;
                }
                String[] fields = entry.split("\\|");
                if (fields.length < 2) {
                    continue;
                }
                Map<String, Object> one = new HashMap<String, Object>();
                one.put("cityName", fields[0]);
                one.put("cityId", fields[1]);
                one.put("provinceId", provinceId);
                cityList.add(one);
            }
        }
        return cityList;
    }

    /**
     * Reads a text file and joins its lines with {@link #LINE_SEPARATOR}.
     *
     * @return the joined content; whatever was read before a failure, or an empty string
     *         when nothing could be read
     */
    private static String readFile(String fileName) {
        StringBuilder content = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(fileName)))) {
            String line;
            boolean first = true;
            while ((line = br.readLine()) != null) {
                if (!first) {
                    content.append(LINE_SEPARATOR);
                }
                content.append(line);
                first = false;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Parses comma-separated province records of the form {@code name|id|code[|type]}.
     * A missing type is stored as {@code "2"} (ordinary province); {@code "1"} marks a
     * municipality. Records with fewer than three fields are skipped.
     *
     * @return one map per province with keys {@code provinceName}, {@code provinceId},
     *         {@code provinceCode}, {@code provinceType}
     */
    private static List<Map<String, Object>> provinceDataHandle(String provinceStr) {
        List<Map<String, Object>> provinceList = new ArrayList<Map<String, Object>>();
        for (String province : provinceStr.split(",")) {
            String[] data = province.split("\\|");
            if (data.length < 3) {
                continue;
            }
            Map<String, Object> one = new HashMap<String, Object>();
            one.put("provinceName", data[0]);
            one.put("provinceId", data[1]);
            one.put("provinceCode", data[2]);
            one.put("provinceType", data.length > 3 ? data[3] : "2");
            provinceList.add(one);
        }
        return provinceList;
    }

    /**
     * Decodes textual Unicode escapes (a backslash, the letter u and four hexadecimal
     * digits) into the characters they denote. All other characters, including the final
     * one and any incomplete or non-hexadecimal escape, are copied unchanged.
     *
     * @param s text that may contain escapes
     * @return the decoded text
     */
    public static String cover(String s) {
        StringBuilder decoded = new StringBuilder(s.length());
        int index = 0;
        while (index < s.length()) {
            char current = s.charAt(index);
            if (current == '\\' && isUnicodeEscapeAt(s, index)) {
                decoded.append((char) Integer.parseInt(s.substring(index + 2, index + 6), 16));
                index += 6;
            } else {
                decoded.append(current);
                index++;
            }
        }
        return decoded.toString();
    }

    /** Tells whether a complete six-character escape starts at {@code index}, which holds a backslash. */
    private static boolean isUnicodeEscapeAt(String s, int index) {
        if (index + 6 > s.length() || s.charAt(index + 1) != 'u') {
            return false;
        }
        for (int k = index + 2; k < index + 6; k++) {
            if (Character.digit(s.charAt(k), 16) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Appends {@code content} to {@code file} while holding an exclusive file lock,
     * retrying once per second until the lock is obtained. I/O failures are reported on
     * the console; an interrupt stops the wait and leaves the thread's interrupt flag set.
     *
     * @param content text to append, encoded with the platform default charset
     * @param file    target file, created if absent
     */
    public static void writeByNIO(String content, File file) {
        try (RandomAccessFile fout = new RandomAccessFile(file, "rw");
                FileChannel fcout = fout.getChannel()) {
            FileLock flout = null;
            while (flout == null) {
                try {
                    // tryLock() yields null when another program holds the lock.
                    flout = fcout.tryLock();
                } catch (OverlappingFileLockException e) {
                    // The lock is held inside this JVM; retry like any other contention.
                }
                if (flout == null) {
                    System.out.print("lock is exist ......");
                    Thread.sleep(1000);
                }
            }
            try {
                // Position at the end only after locking, so another writer cannot slip in between.
                fout.seek(fout.length());
                fout.write(content.getBytes());
            } finally {
                flout.release();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.print("file no find ...");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}