爬蟲程式提取資訊(統計符合條件的檔案數量)
阿新 • • 發佈:2019-02-10
// Layer-limited crawler.
//
// Phase 1 (`callback`): starting from the seed page, fetch one page per layer,
// harvest the short "http(s)://....com" links found in it, count the image-file
// references on the page and descend into the first newly discovered link.
// Phase 2 (`time`): once MAX_LAYERS layers were visited, fetch every collected
// URL again and print per-URL statistics followed by a grand total.

var MAX_LAYERS = 10;       // number of layers visited before the summary phase starts
var MAX_URL_LENGTH = 25;   // extracted links must be strictly shorter than this to be kept

// Crawl state: URL/x are the page currently crawled and its index in `arr`;
// url/num play the same role during the summary phase.
var URL = "http://www.xinhuanet.com", url = URL, total = 0, count = 0, x = 0, num = 0;
// Summary accumulators, the list of collected URLs, the per-URL report strings
// (`js`) and the layer at which each URL was discovered (`mark`).
var sum_png = 0, sum_jpg = 0, sum_gif = 0, sum_jpeg = 0, arr = [], js = {}, mark = {};
arr.push(URL);
mark[url] = 1;
var request = require('request');

/**
 * Counts the non-overlapping occurrences of `needle` in `text`.
 *
 * @param {string} text - Text to scan.
 * @param {string} needle - Non-empty substring to look for.
 * @returns {number} Number of occurrences found.
 */
function countOccurrences(text, needle) {
    var hits = 0;
    var pos = text.indexOf(needle);
    while (pos !== -1) {
        hits++;
        pos = text.indexOf(needle, pos + needle.length);
    }
    return hits;
}

/**
 * Counts the ".png", ".jpg", ".gif" and ".jpeg" substrings in a page body.
 *
 * @param {string} body - Page body.
 * @returns {{png: number, jpg: number, gif: number, jpeg: number, sum: number}}
 *     Per-extension counts and their total.
 */
function countImages(body) {
    var png = countOccurrences(body, ".png");
    var jpg = countOccurrences(body, ".jpg");
    var gif = countOccurrences(body, ".gif");
    var jpeg = countOccurrences(body, ".jpeg");
    return { png: png, jpg: jpg, gif: gif, jpeg: jpeg, sum: png + jpg + gif + jpeg };
}

/**
 * Scans `body` for links that start with `scheme` and end at the next ".com",
 * appending every new, sufficiently short link to `arr` and recording in
 * `mark` the layer at which it was discovered.
 *
 * @param {string} body - Page body.
 * @param {string} scheme - Link prefix, "http://" or "https://".
 * @returns {number} Number of links appended to `arr`.
 */
function collectUrls(body, scheme) {
    var added = 0;
    var index = 0;
    while (true) {
        var start = body.indexOf(scheme, index);
        if (start === -1) {
            break;
        }
        var tldPos = body.indexOf(".com", start + scheme.length);
        if (tldPos === -1) {
            // No ".com" after this occurrence means none after any later one
            // either. Without this guard the end offset would become 3 and
            // the scan would restart from the top of the body forever.
            break;
        }
        var end = tldPos + 4;
        var strURL = body.substring(start, end);
        if (strURL.length < MAX_URL_LENGTH && arr.indexOf(strURL) === -1) {
            arr.push(strURL);
            // The page being crawled is on layer count + 1, so its links
            // belong to layer count + 2.
            mark[strURL] = count + 2;
            added++;
        }
        index = end + 1;
    }
    return added;
}

/** Prints the image counts accumulated over every URL of the summary phase. */
function printGrandTotal() {
    var str = {
        "全部URL.png檔案個數累計 ": sum_png,
        "全部URL.jpg檔案個數累計 ": sum_jpg,
        "全部URL.gif檔案個數累計 ": sum_gif,
        "全部URL.jpeg檔案個數累計 ": sum_jpeg,
        "全部URL符合條件的檔案數量累計 ": sum_png + sum_jpg + sum_gif + sum_jpeg
    };
    console.log(JSON.stringify(str));
}

/**
 * Summary-phase response handler: records and prints the statistics (or the
 * failure) of the URL at index `num`, then requests the next collected URL or
 * prints the grand total after the last one.
 *
 * @param {*} error - Error reported by `request`, falsy on success.
 * @param {Object} response - Response object; only `statusCode` is read.
 * @param {string} body - Response body.
 */
function time(error, response, body) {
    var str;
    if (!error && response.statusCode === 200) {
        var images = countImages(body);
        sum_png += images.png;
        sum_jpg += images.jpg;
        sum_gif += images.gif;
        sum_jpeg += images.jpeg;
        str = {
            "URL ": url,
            "層級 ": mark[url],
            ".png檔案個數 ": images.png,
            ".jpg檔案個數 ": images.jpg,
            ".gif檔案個數 ": images.gif,
            ".jpeg檔案個數 ": images.jpeg,
            "該頁符合條件的檔案總數 ": images.sum
        };
    } else {
        str = { "URL ": url, "層級 ": mark[url], "index ": num, "error ": "URL錯誤。" };
    }
    js[url] = JSON.stringify(str);
    console.log("index:", num, ",URL(Key):", url, "\nMessage(Vaule):", js[url], "\n");

    if (num < arr.length - 1) {
        url = arr[++num];
        request(url, time);
    } else {
        printGrandTotal();
    }
}

/** Starts the summary phase at the current `url` / `num` position. */
function startSummary() {
    console.log("\n\n陣列元素(URL)對應的鍵值:");
    request(url, time);
}

/**
 * Moves the crawl on to the next collected URL after a dead end. When the
 * list is exhausted it starts the summary phase instead of requesting an
 * undefined URL, which would otherwise fail and retry indefinitely.
 */
function crawlNextCandidate() {
    if (x + 1 >= arr.length) {
        console.log("沒有更多可抓取的URL,直接輸出統計結果。");
        startSummary();
        return;
    }
    URL = arr[++x];
    request(URL, callback);
}

/**
 * Crawl-phase response handler: harvests links and image counts from the page
 * at `URL`, prints the layer report and either descends one layer, tries the
 * next candidate URL, or starts the summary phase after MAX_LAYERS layers.
 *
 * @param {*} error - Error reported by `request`, falsy on success.
 * @param {Object} response - Response object; only `statusCode` is read.
 * @param {string} body - Response body.
 */
function callback(error, response, body) {
    if (error || response.statusCode !== 200) {
        if (count === 0) {
            console.log("URL錯誤,執行結束。");
            return;
        }
        console.log("error:", "URL錯誤(index:", x, ",URL:", URL, ")");
        crawlNextCandidate();
        return;
    }

    // http links are harvested before https links so indices in `arr` keep
    // the same order as the two separate scans.
    var cnt = collectUrls(body, "http://");
    cnt += collectUrls(body, "https://");

    var images = countImages(body);
    total += images.sum;
    var str = {
        "URL ": URL,
        "index ": x,
        ".png檔案個數 ": images.png,
        ".jpg檔案個數 ": images.jpg,
        ".gif檔案個數 ": images.gif,
        ".jpeg檔案個數 ": images.jpeg,
        "該層符合條件的檔案總數 ": images.sum
    };

    // A page without new links is a dead end, except on the final layer where
    // it is still reported as a regular layer.
    if (cnt === 0 && count < MAX_LAYERS - 1) {
        if (count === 0) {
            console.log(JSON.stringify(str));
            console.log("抓取不到合適的URL,執行結束。");
            return;
        }
        console.log("warn:該頁(index:", x, ",URL:", URL, ")抓取不到合適的URL,先回上一層再繼續往下走。");
        crawlNextCandidate();
        return;
    }

    var layer = count + 1;
    console.log("\n當前位置:第 ", layer, " 層");
    console.log(JSON.stringify(str));
    console.log("前", layer, "層符合條件的檔案數量累計:", total);
    if (cnt === 0) {
        console.log("該頁沒有抓到合適的URL。");
    } else {
        console.log("陣列儲存該頁(抓取的)URL的下標範圍:", arr.length - cnt, " 到 ", arr.length - 1);
        console.log("第", layer, "層抓取到的URL:");
        for (var i = arr.length - cnt; i < arr.length; i++) {
            console.log("index:", i, ",URL:", arr[i]);
        }
    }

    count++;
    if (count === MAX_LAYERS) {
        startSummary();
        return;
    }

    // Descend into the first link discovered on this page.
    x = arr.length - cnt;
    URL = arr[x];
    console.log("去到下一個層\n");
    request(URL, callback);
}

request(URL, callback);