1. 程式人生 > 實用技巧 >C# 採集頁面資料

C# 採集頁面資料

using HtmlAgilityPack;
using Nito.AsyncEx;
using System;
using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace test1
{
    /// <summary>
    /// Console sample that downloads a web page and extracts one link from it
    /// with HtmlAgilityPack.
    /// </summary>
    class Program
    {
        /// <summary>Pause, in milliseconds, applied before every request and after every failure.</summary>
        private const int RequestSpacingMilliseconds = 1000;

        /// <summary>Timeout, in milliseconds, of the synchronous HTTP request.</summary>
        private const int RequestTimeoutMilliseconds = 5000;

        /// <summary>XPath of the anchor whose <c>href</c> attribute is printed by <see cref="Main"/>.</summary>
        private const string LoginLinkXPath = "//*/a[@name=\"tj_login\"]";

        /// <summary>
        /// Cookie container shared by every request issued through
        /// <see cref="GetHttpDomByUrl"/>. It is attached to each request, so cookies
        /// placed here are sent and cookies set by responses are stored back here.
        /// </summary>
        public static CookieContainer CookiesContainer = new CookieContainer();

        /// <summary>
        /// Downloads the Baidu home page and prints the <c>href</c> of the login link
        /// when the page contains one.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var uri = new Uri("https://www.baidu.com/");
            string pageHtml = AsyncContext.Run(() => GetHttpDomByUrl(uri));

            var document = new HtmlDocument();
            document.LoadHtml(pageHtml);

            // Query the node once; both the node and its href attribute may be missing.
            var loginLink = document.DocumentNode.SelectSingleNode(LoginLinkXPath);
            var href = loginLink?.Attributes["href"]?.Value;
            if (href != null)
            {
                Console.WriteLine("獲取到的資料為:" + href);
            }

            Console.WriteLine("測試成功");
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the page at <paramref name="uri"/> and returns its body decoded as UTF-8,
        /// retrying after a failure.
        /// </summary>
        /// <param name="uri">Address of the page to download.</param>
        /// <param name="proxy">Optional proxy address; <c>null</c> leaves the request's proxy setting untouched.</param>
        /// <param name="maxRetries">
        /// Number of retries allowed after the first failed attempt. A negative value
        /// (the default) retries without limit.
        /// </param>
        /// <returns>The page source.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
        /// <exception cref="Exception">
        /// The last request failure, rethrown once <paramref name="maxRetries"/> is exhausted.
        /// </exception>
        public static async Task<string> GetHttpDomByUrl(Uri uri, string proxy = null, int maxRetries = -1)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            var failedAttempts = 0;
            while (true)
            {
                // Space requests out without blocking the calling thread.
                await Task.Delay(RequestSpacingMilliseconds);

                try
                {
                    // The request stays synchronous on a worker thread on purpose:
                    // HttpWebRequest.Timeout only applies to the synchronous GetResponse().
                    return await Task.Run(() => DownloadPage(uri, proxy));
                }
                catch (Exception ex)
                {
                    failedAttempts++;
                    if (maxRetries >= 0 && failedAttempts > maxRetries)
                    {
                        throw;
                    }

                    Console.WriteLine($"hello, task的執行緒ID為{Thread.CurrentThread.ManagedThreadId}");
                    Console.WriteLine(uri + $"請求頁面失敗正在重新請求,當前執行緒{Thread.CurrentThread.ManagedThreadId}:" + ex.Message.ToString());
                }

                // Back off before looping; the retry reuses the same uri AND the same proxy.
                await Task.Delay(RequestSpacingMilliseconds);
            }
        }

        /// <summary>Performs one browser-like GET request and reads the whole response body as UTF-8 text.</summary>
        private static string DownloadPage(Uri uri, string proxy)
        {
            var request = (HttpWebRequest)WebRequest.Create(uri);
            request.Accept = "*/*";
            request.ServicePoint.Expect100Continue = false;   // skip the 100-continue handshake
            request.ServicePoint.UseNagleAlgorithm = false;   // disable the Nagle algorithm
            request.AllowWriteStreamBuffering = false;        // disable write buffering
            request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate"); // accept compressed pages
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = true;                 // follow redirects automatically
            // Present the request as coming from Google Chrome.
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
            request.Timeout = RequestTimeoutMilliseconds;
            request.KeepAlive = true;
            request.Method = "GET";
            request.CookieContainer = CookiesContainer;

            if (proxy != null)
            {
                request.Proxy = new WebProxy(proxy);
            }

            request.ServicePoint.ConnectionLimit = int.MaxValue;

            using (var response = (HttpWebResponse)request.GetResponse())
            using (var body = OpenDecodedStream(response))
            using (var reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Returns the response stream, wrapped in a decompressing stream when the
        /// Content-Encoding header mentions gzip or deflate.
        /// </summary>
        private static Stream OpenDecodedStream(HttpWebResponse response)
        {
            var contentEncoding = response.ContentEncoding ?? string.Empty;
            var rawStream = response.GetResponseStream();

            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return new GZipStream(rawStream, CompressionMode.Decompress);
            }

            if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return new DeflateStream(rawStream, CompressionMode.Decompress);
            }

            return rawStream;
        }
    }
}