1. 程式人生 > >c#抓取網頁(帶解析js)

c#抓取網頁(帶解析js)

抓取中國人民銀行(www.pbc.gov.cn)匯率 (phantomjs-1.9.2-windows + Selenium.WebDriver.3.8.0)

直接上程式碼

using LTITools.util;
using OpenQA.Selenium;
using OpenQA.Selenium.PhantomJS;
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace LTITools
{

    /// <summary>
    /// 1.第一次訪問,頭部會寫cookies,同時返回的是js,js是加密後的字串,需要反序列化然後執行。
    /// 2.js會判斷當前的瀏覽器window的寬高
    /// 3.js執行後會再次寫cookies,同時跳轉指定的解析出來的地址。
    /// 4.第二個地址回寫cookies,同時頭部302跳轉。
    /// 5.後續需要帶著一二次的訪問返回的cookies進行訪問。
    /// </summary>
    public partial class ChinaBankRate : Form
    {
        /// <summary>
        /// Creates the form: builds the controls via <c>InitializeComponent</c> and then
        /// fills them with their initial values via <c>InitData</c>.
        /// </summary>
        public ChinaBankRate()
        {
            InitializeComponent();
            InitData();
        }

        // Upper bound of the list-page loop in DoWorkGetList; replaced by _totalPage once the first page is parsed.
        int _foreachPageCount = 1;
        // List-page URL template taken from txtUrl; formatted with the page index.
        string _url = "";
        // Total number of entries, parsed from the first list page.
        int _totalCount = 1;
        // Number of list pages to walk, computed in DoWorkGetList from the entries not yet stored (20 per page).
        int _totalPage = 0;
        // Pause in milliseconds used between page loads; read from txtStop and raised to at least 100.
        int _stopMSec = 1000;
        // Assigned by btnExportData_Click and read by ScreenCapture.
        IEnumerable<ChinaBankRateListItem> monthList;


        /// <summary>
        /// Fills the form with its default values: the list-page URL template, the
        /// "clear old data" flag, the month pickers (format, range, default value)
        /// and the usage notes shown in <c>txtAbout</c>.
        /// </summary>
        private void InitData()
        {
            // Built from components instead of being parsed from text, so the value does not
            // depend on the current culture's calendar or date format.
            var earliestDate = new DateTime(2015, 8, 1);

            txtUrl.Text = "http://www.pbc.gov.cn/zhengcehuobisi/125207/125217/125925/17105/index{0}.html";
            chkClearOldData.Checked = false;

            // Default the begin picker to one month ago. Assigning Value directly avoids the
            // culture-dependent string round trip of setting Text to a short date string.
            dtBeginDate.Value = DateTime.Now.AddMonths(-1).Date;
            dtBeginDate.CustomFormat = "yyyy-MM";
            dtEndDate.CustomFormat = "yyyy-MM";
            dtBeginDate.Format = DateTimePickerFormat.Custom;
            dtEndDate.Format = DateTimePickerFormat.Custom;
            dtBeginDate.MinDate = earliestDate;
            dtEndDate.MinDate = earliestDate;
            dtEndDate.MaxDate = DateTime.Now;

            txtAbout.AppendText(" 1.請先通過[第一步,資料抓取]Tab進行抓取,抓取會遇到IP禁用、防抓取網路異常等," +
                "如有異常,可以進行多次抓取(注:抓取過程中會彈出黑框介面,抓取完成後會自動關閉);");
            txtAbout.AppendText("\n 2.抓取成功後,通過[第二步,資料匯出]匯出指定月份的資料(注:如果抓取過程中,則不能進行匯出);");
            txtAbout.AppendText("\n 3.僅能匯出指定當月日期最大的匯率資料(注:僅支援匯出2015年8月以後的資料);");
        }

        /// <summary>
        /// Removes the stored data workbook so the next crawl starts from scratch.
        /// </summary>
        private void ClearData()
        {
            var storedWorkbook = GetExcelPath();
            File.Delete(storedWorkbook);
        }


        /// <summary>
        /// Starts a crawl on a background thread. The worker first collects the list
        /// pages and then loops over the collected entries to fetch each detail page.
        /// </summary>
        /// <param name="sender">The button that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void btnOk_Click(object sender, EventArgs e)
        {
            // Validate the pause before touching any UI state: invalid text used to throw
            // after both buttons had already been disabled. On invalid input the previous
            // pause value is kept.
            int pauseMs;
            if (!int.TryParse(txtStop.Text.Trim(), out pauseMs))
            {
                pauseMs = _stopMSec;
            }

            btnOk.Enabled = false;
            btnExportData.Enabled = false;
            _url = txtUrl.Text.Trim();
            // The worker draws its random pause from [100, _stopMSec), so 100 is the floor.
            _stopMSec = pauseMs < 100 ? 100 : pauseMs;

            try
            {
                if (chkClearOldData.Checked)
                {
                    ClearData();
                }
                Thread t = new Thread(new ThreadStart(DoWorkGetList));
                t.IsBackground = true;
                t.Start();
            }
            catch
            {
                // The worker never started, so nothing else will re-enable the buttons.
                btnOk.Enabled = true;
                btnExportData.Enabled = true;
                throw;
            }
        }


        /// <summary>
        /// Currently a no-op: the body contains only commented-out code and neither
        /// parameter is used. The disabled code measured the window size through injected
        /// script and, when the page source contained "dynamicurl", re-ran the page's
        /// cookie script with its <c>findDimensions()</c> check replaced by <c>false</c>.
        /// </summary>
        /// <param name="driver">PhantomJS driver (unused).</param>
        /// <param name="url">Target URL (unused).</param>
        private void GotoURLAndCheckCookies(PhantomJSDriver driver, string url)
        {
            //var js = "var w= window.innerWidth||document.documentElement.clientWidth||document.body.clientWidth;var h= window.innerHeight||document.documentElement.clientHeight||document.body.clientHeight;"
            //    + "document.body.setAttribute(\"wh\", w*h);";
            //driver.ExecutePhantomJS(js);
            //var wh = driver.FindElement(By.TagName("body")).GetAttribute("wh");

            //if (driver.PageSource.Contains("dynamicurl"))
            //{
            //    driver.ExecuteScript("document.body.setAttribute(\"cookieString\", HXXTTKKLLPPP5);");
            //    var cookieString = driver.FindElement(By.TagName("body")).GetAttribute("cookieString");
            //    var newjs = cookieString.Replace("if(findDimensions())", "if(false)")
            //        .Replace("if(findDimensions())", "if(false)");
            //    driver.ExecutePhantomJS(newjs + " HXXTTKKLLPPP5();");
            //} 
        }


        /// <summary>
        /// Background worker: walks the paged list, adds every entry that is not stored yet
        /// to the data workbook, then hands over to <c>DoWorkGetDetail</c> to fetch the
        /// detail pages. Progress is reported through <c>Invoke</c> on the UI controls.
        /// The buttons are re-enabled and the PhantomJS driver is shut down on every exit
        /// path, including failures.
        /// </summary>
        private void DoWorkGetList()
        {
            Action<String> AsynclblResultAsy = delegate(string text) { lblResult.Text = text; };
            Action<String> AsyncUIDelegateResult = delegate(string text) { txtResult.AppendText(text); };
            Action AsyncUIDelegateDone = delegate() { btnOk.Enabled = true; };
            Action AsyncUIDelegateExportDone = delegate { btnExportData.Enabled = true; };

            // Entries dated on or before this day are not collected.
            var earliestDate = new DateTime(2015, 8, 1);
            // One generator for the whole run instead of a new one per page.
            var random = new Random();
            PhantomJSDriver driver = null;
            bool blret = false;

            try
            {
                txtResult.Invoke(AsyncUIDelegateResult, new object[] { DateTime.Now.ToShortTimeString() + "開始執行,正在抓取列表資料...\n" });

                // Reset the loop bound: it is overwritten while the first page is parsed and
                // would otherwise carry over from a previous run.
                _foreachPageCount = 1;

                driver = new PhantomJSDriver(GetPhantomJSDriverService());
                driver.Manage().Window.Size = new System.Drawing.Size() { Height = 800, Width = 600 };

                var ExistDatalists = GetExistData();
                var historyTotalCount = ExistDatalists.Count;

                for (var i = 1; i <= _foreachPageCount; i++)
                {
                    var url = string.Format(_url, i);

                    driver.Navigate().GoToUrl(url);
                    // Random pause between page loads.
                    int randKey = random.Next(100, _stopMSec);
                    Thread.Sleep(randKey);

                    // Total entry count and paging are read from the first page only.
                    if (i == 1)
                    {
                        Thread.Sleep(_stopMSec);
                        // FindElements returns an empty collection when nothing matches, whereas
                        // FindElement throws; FirstOrDefault therefore gives a real "null if missing".
                        var totalinfos = driver.FindElements(By.CssSelector("td[class='Normal']")).FirstOrDefault();
                        if (null == totalinfos || string.IsNullOrEmpty(totalinfos.Text))
                        {
                            // Not rendered yet: wait once more and retry.
                            Thread.Sleep(_stopMSec);
                            totalinfos = driver.FindElements(By.CssSelector("td[class='Normal']")).FirstOrDefault();
                        }

                        if (null != totalinfos && !string.IsNullOrEmpty(totalinfos.Text))
                        {
                            _totalCount = Convert.ToInt32(totalinfos.Text.Split(',')[0].Split(':')[1]);

                            if (historyTotalCount != _totalCount)
                            {
                                // Pages needed to cover the entries that are not stored yet (20 per page).
                                if ((_totalCount - historyTotalCount) % 20 != 0)
                                    _totalPage = (_totalCount - historyTotalCount) / 20 + 1;
                                else _totalPage = (_totalCount - historyTotalCount) / 20;
                            }
                            else
                            {
                                // Nothing new since the last run.
                                break;
                            }
                            _foreachPageCount = _totalPage;
                        }
                        else
                        {
                            txtResult.Invoke(AsyncUIDelegateResult,
                                new object[] { "totalinfos為空,抓取異常,請稍後試\n" });
                            break;
                        }
                    }

                    // Stop if the current page is already beyond the computed page count.
                    if (i > _foreachPageCount)
                        break;

                    // Links of the list entries on this page.
                    var lists = driver.FindElements(By.CssSelector("font[class='newslist_style'] > a"));

                    if (lists == null || lists.Count == 0)
                    {
                        Thread.Sleep(_stopMSec);
                        lists = driver.FindElements(By.CssSelector("font[class='newslist_style'] > a"));
                    }

                    if (lists == null || lists.Count == 0)
                    {
                        txtResult.Invoke(AsyncUIDelegateResult, new object[] { "lists為空,抓取異常,請稍後試\n" });
                        break;
                    }
                    var breakFlag = "";
                    foreach (var item in lists)
                    {
                        // The text before the first '中' of the link title is used as the entry date.
                        var identifierDate = item.Text.Split('中')[0].Trim();
                        if (!ExistDatalists.Any(t => t.Identifier == identifierDate) && Convert.ToDateTime(identifierDate) > earliestDate)
                        {
                            ExistDatalists.Add(new ChinaBankRateListItem()
                            {
                                PIdentifier = i.ToString(),
                                Identifier = identifierDate,
                                Href = item.GetAttribute("href"),
                                Title = item.Text,
                                IsSucess = "true",
                                HtmlContent = "",
                            });
                        }
                        else
                        {
                            // Reached an entry that is already stored or too old: stop collecting.
                            breakFlag = identifierDate;
                            break;
                        }
                    }
                    if (!string.IsNullOrEmpty(breakFlag))
                    {
                        txtResult.Invoke(AsyncUIDelegateResult, new object[] { "當前已包含 " + breakFlag + "\n" });
                        break;
                    }

                    txtResult.Invoke(AsyncUIDelegateResult, new object[] { "處理完成行第[" + i + "]條列表(暫停" + randKey + "毫秒),url:" + url + "\n" });
                }

                SaveDataToExcel(ExistDatalists.OrderByDescending(t => Convert.ToDateTime(t.Identifier)));
                txtResult.Invoke(AsyncUIDelegateResult, new object[] { "列表資料抓取完成!\n" });

                // Second phase: fetch the detail page of every entry that has no content yet.
                blret = DoWorkGetDetail(driver, AsyncUIDelegateResult, AsynclblResultAsy);
            }
            catch (Exception ex)
            {
                // An unhandled exception on this background thread would take the process down
                // and leave PhantomJS running; report it in the result box instead.
                blret = false;
                txtResult.Invoke(AsyncUIDelegateResult, new object[] { "列表資料抓取異常" + ex.Message + "\n" + ex.StackTrace + "\n" });
            }
            finally
            {
                try
                {
                    btnOk.Invoke(AsyncUIDelegateDone);
                    btnExportData.Invoke(AsyncUIDelegateExportDone);
                    lblResult.Invoke(AsynclblResultAsy, new object[] { string.Format("全部處理完成({0}),{1}", (blret ? "成功" : "有異常,請繼續點選抓取開始"), DateTime.Now) });
                }
                finally
                {
                    // Shut the driver down even if the UI update failed.
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
            }
        }


        /// <summary>
        /// Loads the detail page of every stored entry whose <c>HtmlContent</c> is still
        /// empty, stores the text of the first <c>div#zoom &gt; p</c> element, and writes
        /// the workbook back.
        /// </summary>
        /// <param name="driver">Running PhantomJS driver used to load the pages.</param>
        /// <param name="AsyncUIDelegate">Delegate that appends a line to the result box.</param>
        /// <param name="AsynclblResultAsy">Delegate that sets the status label text.</param>
        /// <returns>
        /// <c>true</c> when every pending entry received content; <c>false</c> when an
        /// exception occurred or at least one page yielded no content.
        /// </returns>
        private bool DoWorkGetDetail(PhantomJSDriver driver, Action<String> AsyncUIDelegate, Action<String> AsynclblResultAsy)
        {
            bool blret = true;
            var ExistDatalists = GetExistData();
            try
            {
                var items = ExistDatalists.Where(t => string.IsNullOrEmpty(t.HtmlContent)).ToArray();
                lblResult.Invoke(AsynclblResultAsy, new object[] { string.Format("開始抓取詳情頁面,總共{0}條資料...", items.Length) });

                // One generator for the whole run instead of a new one per page.
                var random = new Random();

                for (var i = 0; i < items.Length; i++)
                {
                    driver.Navigate().GoToUrl(items[i].Href);
                    // Random pause between page loads.
                    int randKey = random.Next(100, _stopMSec);
                    Thread.Sleep(randKey);

                    // FindElements returns an empty collection when nothing matches, whereas the
                    // single-element lookup throws; this makes the null checks below effective.
                    var content = driver.FindElements(By.CssSelector("div[id='zoom'] > p")).FirstOrDefault();
                    if (content == null || string.IsNullOrEmpty(content.Text))
                    {
                        // Not rendered yet: wait once more and retry.
                        Thread.Sleep(_stopMSec);
                        content = driver.FindElements(By.CssSelector("div[id='zoom'] > p")).FirstOrDefault();
                    }

                    if (content != null && !string.IsNullOrEmpty(content.Text))
                    {
                        items[i].HtmlContent = content.Text;
                        items[i].IsSucess = "true";
                    }
                    else
                    {
                        // Keep going with the remaining entries; this one stays without content
                        // and is picked up again on the next run.
                        items[i].IsSucess = "false";
                        blret = false;
                    }
                    txtResult.Invoke(AsyncUIDelegate, new object[] { string.Format("處理第[{0}]條(暫停{1}毫秒),日期[{2}] \n", i + 1, randKey, items[i].Identifier) });
                }
            }
            catch (Exception ex)
            {
                blret = false;
                // Include the message: the stack trace alone does not say what went wrong.
                txtResult.Invoke(AsyncUIDelegate, new object[] { "詳情資料抓取異常" + ex.Message + "\n" + ex.StackTrace + "\n" });
            }
            // Persist whatever was fetched, even after a failure.
            SaveDataToExcel(ExistDatalists);
            txtResult.Invoke(AsyncUIDelegate, new object[] { "詳情資料抓取完成!\n" });
            txtResult.Invoke(AsyncUIDelegate, new object[] { "全部資料抓取完成!\n" });
            return blret;
        }


        /// <summary>
        /// Creates the PhantomJS driver service with its default settings.
        /// The proxy configuration below is kept for reference but is disabled.
        /// </summary>
        /// <returns>The default driver service.</returns>
        private static PhantomJSDriverService GetPhantomJSDriverService()
        {
            var service = PhantomJSDriverService.CreateDefaultService();
            // Proxy server address (disabled):
            //service.Proxy = $"{ip}:{port}";
            // Proxy server credentials (disabled):
            //service.ProxyAuthentication = GetProxyAuthorization();
            return service;
        }


        /// <summary>
        /// Writes the given entries to the data workbook, replacing any existing file.
        /// </summary>
        /// <param name="list">Entries to store, one row each.</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool SaveDataToExcel(IEnumerable<ChinaBankRateListItem> list)
        {
            var table = new DataTable();
            foreach (var columnName in new[] { "Identifier", "PIdentifier", "Title", "IsSucess", "Href", "HtmlContent" })
            {
                table.Columns.Add(columnName);
            }

            foreach (var entry in list)
            {
                var row = table.NewRow();
                row["Identifier"] = entry.Identifier;
                row["PIdentifier"] = entry.PIdentifier;
                row["Title"] = entry.Title;
                row["IsSucess"] = entry.IsSucess;
                row["Href"] = entry.Href;
                row["HtmlContent"] = entry.HtmlContent;
                table.Rows.Add(row);
            }

            // Remove the old workbook first, then write the new one to the same path.
            var target = GetExcelPath();
            File.Delete(target);
            ExcelHelper.SaveExcelToFile(target, table);
            return true;
        }

        /// <summary>
        /// Builds the full path of a workbook that lives in the application's base directory.
        /// </summary>
        /// <param name="name">File name without extension; defaults to "data".</param>
        /// <returns>The path of <c>name.xlsx</c> inside the base directory.</returns>
        private string GetExcelPath(string name = "data")
        {
            // Path.Combine inserts a separator only when one is missing, so the result carries
            // no doubled backslash when the base directory already ends with one.
            return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, name + ".xlsx");
        }

        /// <summary>
        /// Reads the entries stored in the data workbook.
        /// </summary>
        /// <returns>The stored entries, or an empty list when the workbook does not exist.</returns>
        private List<ChinaBankRateListItem> GetExistData()
        {
            var stored = new List<ChinaBankRateListItem>();
            var workbookPath = GetExcelPath();
            if (!File.Exists(workbookPath))
            {
                return stored;
            }

            // Every cell is read as trimmed text.
            Func<DataRow, string, string> cell = delegate(DataRow source, string column)
            {
                return source[column].ToString().Trim();
            };

            var table = ExcelHelper.ReadExcelFile(workbookPath, 0);
            foreach (DataRow record in table.Rows)
            {
                var entry = new ChinaBankRateListItem();
                entry.Href = cell(record, "Href");
                entry.HtmlContent = cell(record, "HtmlContent");
                entry.Identifier = cell(record, "Identifier");
                entry.Title = cell(record, "Title");
                entry.IsSucess = cell(record, "IsSucess");
                entry.PIdentifier = cell(record, "PIdentifier");
                stored.Add(entry);
            }
            return stored;
        }

        /// <summary>
        /// Exports the rates of the selected month range: for every month the stored entry
        /// with the latest date is parsed into rate rows, the rows are written to a new
        /// workbook under <c>Export</c>, and the workbook is opened. Screenshots of the
        /// selected entries are taken on a background thread.
        /// </summary>
        /// <param name="sender">The button that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void btnExportData_Click(object sender, EventArgs e)
        {
            btnExportData.Enabled = false;
            btnExportData.Text = "資料匯出中...";

            try
            {
                var list = GetExistData();
                var beginDate = new DateTime(dtBeginDate.Value.Year, dtBeginDate.Value.Month, 1);
                // Last day of the end month.
                var endDate = new DateTime(dtEndDate.Value.Year, dtEndDate.Value.Month, 1).AddMonths(1).AddDays(-1);

                // Parse each date once, keep the entries inside the range newest first, and take
                // the first (latest) entry of every month. ToList materializes the result so the
                // screenshot thread and the loop below share one fixed snapshot instead of each
                // re-running a deferred query.
                var months = list
                    .Select(t => new { Item = t, Date = Convert.ToDateTime(t.Identifier) })
                    .Where(x => beginDate <= x.Date && x.Date <= endDate)
                    .OrderByDescending(x => x.Date)
                    .GroupBy(x => x.Date.ToString("yyyy-MM"))
                    .Select(g => g.First().Item)
                    .ToList();
                monthList = months;

                // Screenshots run in the background.
                Thread t1 = new Thread(new ThreadStart(ScreenCapture));
                t1.IsBackground = true;
                t1.Start();

                var allExportData = new List<ChinaBankRateExport>();
                foreach (var item in months)
                {
                    var datas = GetRateFromHtmlContent(Convert.ToDateTime(item.Identifier), item.HtmlContent);
                    allExportData.AddRange(datas);
                }

                var dt = new DataTable();
                dt.Columns.Add("起兌幣種");
                dt.Columns.Add("兌換幣種");
                dt.Columns.Add("匯率");
                dt.Columns.Add("狀態");
                dt.Columns.Add("生效日期");
                dt.Columns.Add("備註");
                foreach (var item in allExportData)
                {
                    DataRow dr = dt.NewRow();
                    var index = 0;
                    dr[index++] = item.From;
                    dr[index++] = item.To;
                    dr[index++] = item.Rate;
                    dr[index++] = item.Status;
                    dr[index++] = item.EffectiveDate;
                    dr[index++] = item.Des;
                    dt.Rows.Add(dr);
                }

                string baseDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Export");
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(baseDir);
                var fullPath = Path.Combine(baseDir, "From" + beginDate.ToString("yyyyMM") + "To"
                    + endDate.ToString("yyyyMM")
                    + "_" + DateTime.Now.ToString("yyyyMMddHHmmss") + ".xlsx");

                ExcelHelper.SaveExcelToFile(fullPath, dt);
                System.Diagnostics.Process.Start(fullPath);
                lblExportResult.Text = "匯出完成!" + DateTime.Now.ToShortDateString();
            }
            finally
            {
                // Restore the button on every exit path; a failure used to leave it disabled
                // with the "exporting" caption.
                btnExportData.Enabled = true;
                btnExportData.Text = "匯出指定日期資料";
            }
        }


        /// <summary>
        /// Extracts one rate per known currency from the announcement text. The text is split
        /// into comma-separated segments; for each entry of <c>RateDic.RateNameDic</c> the
        /// first segment that contains the currency name and a decimal number supplies the rate.
        /// </summary>
        /// <param name="dt">Effective date stored on every produced row.</param>
        /// <param name="htmlContent">Announcement text; may be null or empty.</param>
        /// <returns>The extracted rows, all with target currency "CNY"; empty when nothing matched.</returns>
        private List<ChinaBankRateExport> GetRateFromHtmlContent(DateTime dt, string htmlContent)
        {
            var exportRate = new List<ChinaBankRateExport>();
            if (string.IsNullOrEmpty(htmlContent))
            {
                return exportRate;
            }

            // Segments starting from CNY quote "1 yuan = x foreign" and must be inverted.
            var CNYtoFlags = new string[] { "人民幣1元對" };

            // Split on the ASCII comma and on the full-width comma (U+FF0C). The separator list
            // previously contained the same comma twice.
            // NOTE(review): presumably one of the two was meant to be the full-width comma - confirm.
            var arrHtml = htmlContent.Split(new char[] { ',', '\uFF0C' }, StringSplitOptions.RemoveEmptyEntries);

            var reg = new Regex(@"\d+\.\d*");
            // The matched text always uses '.' as the decimal separator, so it is parsed with the
            // invariant culture rather than the current one.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            foreach (var item in RateDic.RateNameDic)
            {
                foreach (var html in arrHtml)
                {
                    if (!html.Contains(item.Key))
                    {
                        continue;
                    }

                    var math = reg.Match(html);
                    if (!math.Success)
                    {
                        continue;
                    }

                    decimal rate = Convert.ToDecimal(math.Value, invariant);
                    if (CNYtoFlags.Any(t => html.Contains(t)))
                    {
                        if (rate == 0M)
                        {
                            // A zero quote cannot be inverted; try the next segment.
                            continue;
                        }
                        // CNY-to-foreign quote: convert to foreign-to-CNY.
                        rate = MathHelper.Round6P(1 / rate);
                    }

                    // Yen is quoted per 100 units.
                    if (html.Contains("100日元"))
                    {
                        rate = MathHelper.Round6P(rate / 100);
                    }

                    exportRate.Add(new ChinaBankRateExport()
                    {
                        Des = item.Key,
                        EffectiveDate = dt,
                        From = item.Value,
                        To = "CNY",
                        Rate = rate,
                        Status = "有效",
                    });

                    // Only the first matching segment is used for each currency.
                    break;
                }
            }

            return exportRate;
        }


        /// <summary>
        /// Background worker: for every entry in <c>monthList</c> that has no screenshot yet,
        /// runs <c>phantomjs.exe rasterize.js &lt;url&gt; &lt;png&gt;</c> and waits up to five
        /// seconds for it. The "show images" button is made visible afterwards, whether or not
        /// the captures succeeded.
        /// </summary>
        private void ScreenCapture()
        {
            Action AsynclbtnShowImg = delegate { btnShowImg.Visible = true; };
            string baseDir = new DirectoryInfo(AppDomain.CurrentDomain.BaseDirectory) + "\\Export\\Capture\\";
            try
            {
                // Read the shared field once and copy it, so a later export cannot swap the
                // sequence while this thread is still iterating.
                var source = monthList;
                var months = source == null ? new List<ChinaBankRateListItem>() : source.ToList();

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(baseDir);

                foreach (var item in months)
                {
                    var fullPath = baseDir + item.Identifier + ".png";
                    if (File.Exists(fullPath))
                    {
                        continue;
                    }

                    // Disposing releases the process handle; it does not stop a capture that is
                    // still running after the timeout.
                    using (var p = new System.Diagnostics.Process())
                    {
                        p.StartInfo.WindowStyle = System.Diagnostics.ProcessWindowStyle.Hidden;
                        p.StartInfo.FileName = "phantomjs.exe";
                        p.StartInfo.WorkingDirectory = AppDomain.CurrentDomain.BaseDirectory;
                        // Quote both arguments: the URL comes from a scraped page and the path may
                        // contain spaces. Embedded quotes in the URL are percent-encoded.
                        var href = (item.Href ?? "").Replace("\"", "%22");
                        p.StartInfo.Arguments = "rasterize.js \"" + href + "\" \"" + fullPath + "\"";
                        p.Start();
                        p.WaitForExit(5000);
                    }
                }
            }
            catch (Exception ex)
            {
                // Screenshots are best-effort, but the failure is recorded rather than dropped.
                System.Diagnostics.Trace.TraceError("ScreenCapture failed: " + ex);
            }

            lblExportResult.Invoke(AsynclbtnShowImg);
        }

        /// <summary>
        /// Opens the screenshot folder (<c>Export\Capture</c> under the base directory).
        /// </summary>
        /// <param name="sender">The button that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void btnShowImg_Click(object sender, EventArgs e)
        {
            var appRoot = new DirectoryInfo(AppDomain.CurrentDomain.BaseDirectory);
            string captureFolder = string.Concat(appRoot, "\\Export\\Capture");
            System.Diagnostics.Process.Start(captureFolder);
        }


    }



    #region 實體類
    /// <summary>
    /// One entry of the crawled list, stored as one row of the data workbook.
    /// All values are kept as text.
    /// </summary>
    public class ChinaBankRateListItem
    {
        /// <summary>Date text of the entry.</summary>
        public string Identifier { get; set; }

        /// <summary>Parent identifier.</summary>
        public string PIdentifier { get; set; }

        /// <summary>Full link text of the entry.</summary>
        public string Title { get; set; }

        /// <summary>"true" or "false", stored as text.</summary>
        public string IsSucess { get; set; }

        /// <summary>URL of the detail page.</summary>
        public string Href { get; set; }

        /// <summary>Text taken from the detail page; empty until it has been fetched.</summary>
        public string HtmlContent { get; set; }
    }

    /*
    HKD	港幣
    IDR	印度尼西亞盧比
    INR	印度盧比
    USD	美元
    EUR	歐元
    GBP	英鎊
    TWD	新臺幣
    CAD	加拿大元
    MXN	墨西哥比索
    AUD	澳大利亞元
    BRL	巴西雷阿爾
    KRW	韓國元
    MYR	馬來西亞林吉特
    JPY	日元
    ZAR	南非蘭特
    THB	泰國銖
    CHF	瑞士法郎
    SGD	新加坡元
    NZD	紐西蘭元
    PHP	菲律賓比索
    MOP	澳門元
    CNY	人民幣
    NZD  紐西蘭元
    SGD 新加坡
    RUB 俄羅斯盧布
    KRW 韓元
         */

    /// <summary>
    /// Maps the currency names that appear in the announcement text to their currency codes.
    /// Matching was first done with a regular expression, but special characters and spaces
    /// caused problems, so the lookup switched to splitting the text and substring containment.
    /// </summary>
    public static class RateDic
    {
        /// <summary>Currency name (as written in the announcement) to currency code.</summary>
        public static Dictionary<string, string> RateNameDic { get; set; }

        static RateDic()
        {
            RateNameDic = new Dictionary<string, string>
            {
                { "美元", "USD" },
                { "印度盧比", "INR" },
                { "歐元", "EUR" },
                { "日元", "JPY" },
                { "港元", "HKD" },
                { "英鎊", "GBP" },
                { "澳大利亞元", "AUD" },
                { "紐西蘭元", "NZD" },
                { "新加坡元", "SGD" },
                { "瑞士法郎", "CHF" },
                { "加拿大元", "CAD" },
                { "俄羅斯盧布", "RUB" },

                { "林吉特", "MYR" },
                { "南非蘭特", "ZAR" },
                { "韓元", "KRW" },
                // Disabled entries:
                //{ "阿聯酋迪拉姆", "AED" },
                //{ "沙特里亞爾", "SAR" },
                //{ "匈牙利福林", "HUF" },
                //{ "波蘭茲羅提", "PLN" },
                //{ "丹麥克朗", "DKK" },
                //{ "瑞典克朗", "SEK" },
                //{ "挪威克朗", "NOK" },
                //{ "土耳其里拉", "TRY" },
                { "墨西哥比索", "MXN" },
            };
        }
    }

    /// <summary>
    /// One row of the exported rate workbook.
    /// </summary>
    public class ChinaBankRateExport
    {
        /// <summary>Source currency code.</summary>
        public string From { get; set; }

        /// <summary>Target currency code.</summary>
        public string To { get; set; }

        /// <summary>Exchange rate from <see cref="From"/> to <see cref="To"/>.</summary>
        public decimal Rate { get; set; }

        /// <summary>Date the rate applies to.</summary>
        public DateTime EffectiveDate { get; set; }

        /// <summary>Status text of the row.</summary>
        public string Status { get; set; }

        /// <summary>Free-text remark.</summary>
        public string Des { get; set; }
    }


    #endregion




}


1.這個網站不是直接通過ajax請求資料,如果是這樣通過等待就可以抓取到資料,該網站先是通過返回的js生成cookie,然後帶上cookie訪問動態地址,然後再生成cookie,帶上所有的cookie,再去訪問302,最後得到結果。具體看參考http://www.jianshu.com/p/11fac0596020

2.參考抓取獲取cookies https://www.cnblogs.com/songxingzhu/p/7110723.html

3.獲取裡面的js變數 http://michaelthelin.se/javascript/testing/webdriver/2013/02/14/webdriver-reading-the-value-of-a-javascript-variable-spoiler-weirdness.html 

參考2:

1.這個網站處理辦法如下:

1、進入搜尋頁面,得到js
2、htmlfile.write反混淆js,得到類似的兩個函式function KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(str)和function QWERTASDFGXYSF()
3、執行這兩個函式,得到兩個cookie
        cookieString = "wzwstemplate=" + KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(template.toString()) + "; path=/";
        var confirm = QWERTASDFGXYSF();

        cookieString = "wzwschallenge=" + KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(confirm.toString()) + "; path=/";
4、根據dynamicurl中的地址,帶著三個Cookie: wzwsconfirm=   wzwstemplate=  wzwschallenge= 
      得到Cookie ccpassport=,和302跳轉
5、帶著4個cookie,經過兩次302跳轉,就可以進入search頁面,獲得JSESSIONID,後面就好辦了

這個網站比較複雜。

參考3:

1.http://www.cnblogs.com/endlock/p/6423613.html 使用Selenium來操作PhantomJS絕配

2.收費:https://www.nrecosite.com/phantomjs_wrapper_net.aspx

3.抓取中行 http://xusheng.org/blog/2016/10/19/ru-he-zhua-qu-diao-cha-tong-ji-si-de-shu-ju/ 

4.Webdriver: Reading the value of a Javascript variable (spoiler: weirdness):

 http://michaelthelin.se/javascript/testing/webdriver/2013/02/14/webdriver-reading-the-value-of-a-javascript-variable-spoiler-weirdness.html

5. Python小記:selenium+PhantomJS爬蟲解決頁面js新增cookie : https://www.jianshu.com/p/11fac0596020