使用Html Agility Pack快速解析Html內容
阿新 • • 發佈:2019-01-13
Html Agility Pack 是一個開源的 .NET 平台 HTML 解析器,可將 HTML 載入為 DOM 並以 XPath 查詢節點。
開源地址:https://github.com/zzzprojects/html-agility-pack
用法:在 Visual Studio 中透過 NuGet 套件管理員搜尋 HtmlAgilityPack 並安裝。
示例程式碼1:
/// <summary>
/// Downloads the stock list page and returns its HTML as a string.
/// </summary>
/// <remarks>
/// The request advertises gzip and deflate support. Decompression is delegated to
/// <see cref="HttpClientHandler"/> so the body is decoded according to the encoding
/// the server actually used (gzip, deflate, or none) instead of assuming gzip.
/// The method is synchronous by signature, so the async HTTP calls are blocked on
/// with <c>.Result</c>; transport failures therefore surface wrapped in
/// <see cref="AggregateException"/>.
/// </remarks>
/// <returns>The response body decoded with the gb2312 code page.</returns>
/// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
private static string GetHtml()
{
    const string url = "http://quote.eastmoney.com/stocklist.html";

    // Key point 1: the handler sends "Accept-Encoding: gzip, deflate" and transparently
    // decompresses whichever of the two the server chooses (or passes plain bodies through).
    var handler = new HttpClientHandler
    {
        AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate
    };

    // Disposing the client also disposes the handler it owns.
    using (var client = new HttpClient(handler))
    using (var response = client.GetAsync(url).Result)
    {
        // Fail fast rather than trying to parse an error page as the stock list.
        response.EnsureSuccessStatusCode();

        // Key point 2: the stream handed back here is already decompressed.
        using (var body = response.Content.ReadAsStreamAsync().Result)
        // The page is decoded as gb2312 so Chinese text is read correctly.
        // NOTE(review): on .NET Core / .NET 5+ this code page is only available after
        // registering CodePagesEncodingProvider - confirm against the target framework.
        using (var reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
}
示例程式碼2:
/// <summary>
/// Downloads the stock list page, extracts every stock link with an XPath query,
/// and prints each stock's name and code to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string html = GetHtml();

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Select every <a> under li under ul under the div whose id is "quotesearch".
    var nodes = doc.DocumentNode.SelectNodes("//div[@id='quotesearch']/ul/li/a");

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // e.g. if the page layout changed; guard so the loop does not throw.
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            // Link text is expected to look like "name(code)" - presumably; verify against the page.
            string text = node.InnerText;

            // Split on the LAST '(' so names that themselves contain parentheses stay intact.
            int open = text.LastIndexOf('(');
            if (open < 0)
            {
                // No code part present; skip instead of indexing past the end of a split result.
                continue;
            }

            string name = text.Substring(0, open);
            string code = text.Substring(open + 1).Replace(")", "");

            Console.WriteLine(string.Format("股票名稱:{0},股票程式碼:{1}", name, code));
        }
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}