1. 程式人生 > 爬蟲程式提取資訊(統計符合條件的檔案數量)

爬蟲程式提取資訊(統計符合條件的檔案數量)

// ---- Shared crawl state (read and updated by callback) ----

// Page being fetched by the level-by-level crawl; starts at the seed site.
var URL = "http://www.xinhuanet.com";
// Page being fetched by the final per-URL summary pass; also starts at the seed.
var url = URL;

// Every URL discovered so far, seed first.
var arr = [URL];
// URL -> level number at which it was discovered (the seed is level 1).
var mark = {};
mark[url] = 1;
// URL -> JSON string describing that page, filled in by the summary pass.
var js = {};

// Running total of matching image references over the crawled levels.
var total = 0;
// Number of levels completed so far.
var count = 0;
// Index in arr of the page the crawl is currently on.
var x = 0;
// Index in arr of the page the summary pass is currently on.
var num = 0;

// Per-extension totals accumulated by the summary pass.
var sum_png = 0;
var sum_jpg = 0;
var sum_gif = 0;
var sum_jpeg = 0;
var request = require('request');

// Number of levels to crawl before switching to the per-URL summary pass.
var MAX_LEVELS = 10;
// Extracted links must be shorter than this to be queued for crawling.
var MAX_LINK_LENGTH = 25;

/**
 * Counts the non-overlapping occurrences of `needle` in `text`.
 *
 * @param {string} text - Text to search.
 * @param {string} needle - Non-empty substring to look for.
 * @returns {number} How many times `needle` occurs.
 */
function countOccurrences(text, needle) {
    var hits = 0;
    var at = text.indexOf(needle);
    while (at !== -1) {
        hits++;
        at = text.indexOf(needle, at + needle.length);
    }
    return hits;
}

/**
 * Counts the ".png", ".jpg", ".gif" and ".jpeg" references in a page body.
 *
 * @param {string} body - Page source.
 * @returns {{png: number, jpg: number, gif: number, jpeg: number, sum: number}}
 *     Per-extension counts and their total.
 */
function countImageRefs(body) {
    var png = countOccurrences(body, ".png");
    var jpg = countOccurrences(body, ".jpg");
    var gif = countOccurrences(body, ".gif");
    var jpeg = countOccurrences(body, ".jpeg");
    return { png: png, jpg: jpg, gif: gif, jpeg: jpeg, sum: png + jpg + gif + jpeg };
}

/**
 * Scans `body` for links that start with `scheme` and end at the next ".com",
 * appending each new, short enough link to `arr` and recording its level in `mark`.
 *
 * @param {string} body - Page source.
 * @param {string} scheme - Link prefix to look for ("http://" or "https://").
 * @param {number} level - Level number stored in `mark` for newly found links.
 * @returns {number} How many links were appended to `arr`.
 */
function collectLinks(body, scheme, level) {
    var added = 0;
    var index = 0;
    while (true) {
        var start = body.indexOf(scheme, index);
        if (start === -1) {
            break;
        }
        var comAt = body.indexOf(".com", start + scheme.length);
        if (comAt === -1) {
            // No ".com" after this point, so no further link can be extracted.
            // Without this guard the end offset became 3, the scan position was
            // reset to 4 and the same occurrence was matched again forever.
            break;
        }
        var end = comAt + 4;
        var link = body.substring(start, end);
        if (link.length < MAX_LINK_LENGTH && arr.indexOf(link) === -1) {
            arr.push(link);
            mark[link] = level;
            added++;
        }
        // NOTE: an over-long candidate still moves the scan past its ".com", so a
        // shorter link contained inside it is not examined (original heuristic).
        index = end + 1;
    }
    return added;
}

/**
 * Prints the per-extension totals accumulated by the summary pass.
 */
function printSummaryTotals() {
    var totals = {
        "全部URL.png檔案個數累計 ": sum_png,
        "全部URL.jpg檔案個數累計 ": sum_jpg,
        "全部URL.gif檔案個數累計 ": sum_gif,
        "全部URL.jpeg檔案個數累計 ": sum_jpeg,
        "全部URL符合條件的檔案數量累計 ": sum_png + sum_jpg + sum_gif + sum_jpeg
    };
    console.log(JSON.stringify(totals));
}

/**
 * Response handler of the summary pass: records the image counts (or the
 * failure) of the page `url`, then requests the next entry of `arr`, or prints
 * the grand totals once the last entry has been handled.
 *
 * @param {*} error - Error reported by request, if any.
 * @param {Object} response - Response object; only `statusCode` is read.
 * @param {string} body - Page source.
 */
function summarizePage(error, response, body) {
    var report;
    if (!error && response.statusCode === 200) {
        var images = countImageRefs(body);
        sum_png += images.png;
        sum_jpg += images.jpg;
        sum_gif += images.gif;
        sum_jpeg += images.jpeg;
        report = {
            "URL ": url,
            "層級 ": mark[url],
            ".png檔案個數 ": images.png,
            ".jpg檔案個數 ": images.jpg,
            ".gif檔案個數 ": images.gif,
            ".jpeg檔案個數 ": images.jpeg,
            "該頁符合條件的檔案總數 ": images.sum
        };
    }
    else {
        report = {
            "URL ": url,
            "層級 ": mark[url],
            "index ": num,
            "error ": "URL錯誤。"
        };
    }
    js[url] = JSON.stringify(report);
    console.log("index:", num, ",URL(Key):", url, "\nMessage(Vaule):", js[url], "\n");

    if (num < arr.length - 1) {
        url = arr[++num];
        request(url, summarizePage);
        return;
    }
    printSummaryTotals();
}

/**
 * Starts the summary pass, which fetches every URL in `arr` one after another
 * beginning with the current value of `url`.
 */
function startSummary() {
    console.log("\n\n陣列元素(URL)對應的鍵值:");
    request(url, summarizePage);
}

/**
 * Moves the crawl to the next entry of `arr` after a page failed or yielded no
 * new links. When `arr` has no further entry the crawl cannot continue, so the
 * summary pass is started instead of passing `undefined` to request.
 */
function crawlNextKnownUrl() {
    if (x + 1 >= arr.length) {
        console.log("已沒有其他可抓取的URL,提前結束抓取並進入統計。");
        startSummary();
        return;
    }
    URL = arr[++x];
    request(URL, callback);
}

/**
 * Response handler of the level-by-level crawl.
 *
 * On success it extracts new links from the page, counts its image references,
 * logs the level report and requests the first newly found link as the next
 * level. After MAX_LEVELS levels it hands over to the summary pass. A failed
 * page, or one without new links, is skipped in favour of the next known URL;
 * if that happens on the very first page the run ends.
 *
 * @param {*} error - Error reported by request, if any.
 * @param {Object} response - Response object; only `statusCode` is read.
 * @param {string} body - Page source.
 */
function callback(error, response, body) {
    if (error || response.statusCode !== 200) {
        if (count === 0) {
            console.log("URL錯誤,執行結束。");
            return;
        }
        console.log("error:", "URL錯誤(index:", x, ",URL:", URL, ")");
        crawlNextKnownUrl();
        return;
    }

    // Links found on a page belong to the level below it; levels are 1-based.
    var level = count + 2;
    var cnt = collectLinks(body, "http://", level) + collectLinks(body, "https://", level);

    var images = countImageRefs(body);
    total += images.sum;
    var str = {
        "URL ": URL,
        "index ": x,
        ".png檔案個數 ": images.png,
        ".jpg檔案個數 ": images.jpg,
        ".gif檔案個數 ": images.gif,
        ".jpeg檔案個數 ": images.jpeg,
        "該層符合條件的檔案總數 ": images.sum
    };

    // A dead end is tolerated on the last level only; earlier levels need links.
    if (cnt === 0 && count < MAX_LEVELS - 1) {
        if (count === 0) {
            console.log(JSON.stringify(str));
            console.log("抓取不到合適的URL,執行結束。");
            return;
        }
        console.log("warn:該頁(index:", x, ",URL:", URL, ")抓取不到合適的URL,先回上一層再繼續往下走。");
        crawlNextKnownUrl();
        return;
    }

    console.log("\n當前位置:第 ", count + 1, " 層");
    console.log(JSON.stringify(str));
    console.log("前", count + 1, "層符合條件的檔案數量累計:", total);

    if (cnt === 0) {
        console.log("該頁沒有抓到合適的URL。");
    }
    else {
        console.log("陣列儲存該頁(抓取的)URL的下標範圍:", arr.length - cnt, " 到 ", arr.length - 1);
        console.log("第", count + 1, "層抓取到的URL:");
        for (var i = arr.length - cnt; i < arr.length; i++) {
            console.log("index:", i, ",URL:", arr[i]);
        }
    }

    count++;
    if (count === MAX_LEVELS) {
        startSummary();
        return;
    }

    // Descend into the first link discovered on this page.
    x = arr.length - cnt;
    URL = arr[x];
    console.log("去到下一個層\n");
    request(URL, callback);
}
// Kick off the crawl: fetch the starting URL and hand the response to callback.
request(URL, callback);