如何通過URL抓取網站的資料
阿新 • • 發佈:2019-02-05
使用的外部dll
HtmlAgilityPack:目的是將html原始碼解析成xml格式,方便使用
1.通過url獲取網站原始碼,這裡是一個比較簡單的方法;方法簡單無妨,能用就行
/// <summary>
/// Fetches <paramref name="url"/> with an HTTP GET and returns the response body as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="en">Encoding used to decode the response stream, e.g. Encoding.Default.</param>
/// <returns>The decoded response body.</returns>
public string getWeb(string url, Encoding en)
{
    // An empty collection is sent with the request. The code that would fill it
    // from response.Headers["Set-Cookie"] is omitted in this article.
    CookieCollection cookies = new CookieCollection();

    // HttpWebResponseUtility is the helper class listed later in this article.
    // The using blocks release the response, the stream and the reader even
    // when reading throws.
    using (HttpWebResponse response = HttpWebResponseUtility.CreateGetHttpResponse(url, null, null, cookies))
    using (Stream sm = response.GetResponseStream())
    using (System.IO.StreamReader streamReader = new System.IO.StreamReader(sm, en))
    {
        // Convert the whole stream into a string.
        return streamReader.ReadToEnd();
    }
}
2.獲取原始碼之後通過HtmlAgilityPack解析原始碼
//規則類
/// <summary>
/// Holds a matching rule and a returned value.
/// </summary>
public class TestWeb
{
    /// <summary>Matching rule.</summary>
    public string testMatch { get; set; }

    /// <summary>Returned value.</summary>
    public string test { get; set; }
}
解析原始碼,返回List資料
一個簡單的使用url獲取資料的方法就完了。
/// <summary>
/// Downloads a page, selects the tables at a fixed XPath and extracts one cell from each.
/// </summary>
/// <param name="category">Not used by this implementation.</param>
/// <returns>
/// One TestWeb per table that contains the target cell, with the cell text
/// (spaces removed) in its test property. Empty when no table matches.
/// </returns>
public List<TestWeb> getLegalPage(Category category)
{
    // Rule whose XPath is evaluated against every matched table.
    TestWeb test = new TestWeb();
    test.testMatch = "//tr[2]/td[2]";

    CreateHtml ch = new CreateHtml("", "");
    HtmlDocument document = new HtmlDocument();
    // "url路勁" and "url編碼" are placeholders: substitute the real page address and
    // the real encoding name (e.g. "utf-8"). getWeb expects an Encoding, so the
    // name is resolved through Encoding.GetEncoding.
    document.LoadHtml(ch.getWeb("url路勁", System.Text.Encoding.GetEncoding("url編碼")));

    List<TestWeb> list = new List<TestWeb>();

    // Match every data table.
    HtmlNodeCollection categoryNodeList = document.DocumentNode.SelectNodes("//html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/div[1]/table");
    if (categoryNodeList == null)
    {
        // Nothing matched; return the empty list instead of failing in the loop.
        return list;
    }

    foreach (HtmlNode categoryNode in categoryNodeList)
    {
        // Re-parse the table on its own so the rule's XPath is evaluated against this fragment.
        HtmlNode temp = HtmlNode.CreateNode(categoryNode.OuterHtml);

        HtmlNode cell = temp.SelectSingleNode(test.testMatch);
        if (cell == null)
        {
            // Tables without the target cell are skipped.
            continue;
        }

        TestWeb item = new TestWeb();
        item.test = cell.InnerText.Replace(" ", "");
        list.Add(item);
    }

    return list;
}
這裡提供一個http請求輔助類HttpWebResponseUtility(轉載自-----)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.DirectoryServices.Protocols;
using System.ServiceModel.Security;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;

namespace Common
{
    /// <summary>
    /// Helper class for issuing HTTP GET and POST requests.
    /// </summary>
    public class HttpWebResponseUtility
    {
        private static readonly string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

        /// <summary>
        /// Sends an HTTP GET request and returns the response.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="timeout">Value assigned to the request's Timeout; null leaves it unchanged.</param>
        /// <param name="userAgent">User-Agent header value; the built-in default is used when null or empty.</param>
        /// <param name="cookies">Cookies sent with the request; may be null when none are needed.</param>
        /// <returns>The response; the caller is responsible for disposing it.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null or empty.</exception>
        /// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request.</exception>
        public static HttpWebResponse CreateGetHttpResponse(string url, int? timeout, string userAgent, CookieCollection cookies)
        {
            if (string.IsNullOrEmpty(url))
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = CreateRequest(url);
            request.Method = "GET";
            ApplyCommonSettings(request, timeout, userAgent, cookies);

            return (HttpWebResponse)request.GetResponse();
        }

        /// <summary>
        /// Sends an HTTP POST request with a form body and returns the response.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="parameters">Names and values to post; may be null or empty for no body.</param>
        /// <param name="timeout">Value assigned to the request's Timeout; null leaves it unchanged.</param>
        /// <param name="userAgent">User-Agent header value; the built-in default is used when null or empty.</param>
        /// <param name="requestEncoding">Encoding used to turn the form body into bytes.</param>
        /// <param name="cookies">Cookies sent with the request; may be null when none are needed.</param>
        /// <returns>The response; the caller is responsible for disposing it.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="url"/> is null or empty, or <paramref name="requestEncoding"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request.</exception>
        public static HttpWebResponse CreatePostHttpResponse(string url, IDictionary<string, string> parameters, int? timeout, string userAgent, Encoding requestEncoding, CookieCollection cookies)
        {
            if (string.IsNullOrEmpty(url))
            {
                throw new ArgumentNullException("url");
            }
            if (requestEncoding == null)
            {
                throw new ArgumentNullException("requestEncoding");
            }

            bool isHttps = url.StartsWith("https", StringComparison.OrdinalIgnoreCase);
            if (isHttps)
            {
                // WARNING: this installs a process-wide callback that accepts every
                // server certificate (see CheckValidationResult), not just for this request.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            }

            HttpWebRequest request = CreateRequest(url);
            if (isHttps)
            {
                request.ProtocolVersion = HttpVersion.Version10;
            }

            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            ApplyCommonSettings(request, timeout, userAgent, cookies);

            // Only write a body when there is data to post.
            if (parameters != null && parameters.Count > 0)
            {
                byte[] data = requestEncoding.GetBytes(BuildFormBody(parameters));
                using (Stream stream = request.GetRequestStream())
                {
                    stream.Write(data, 0, data.Length);
                }
            }

            return (HttpWebResponse)request.GetResponse();
        }

        /// <summary>
        /// Creates the request object and rejects URLs that do not yield an HTTP request.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            // WebRequest.Create returns other request types for schemes such as file:,
            // in which case the cast yields null; fail with a clear message instead of
            // a NullReferenceException further down.
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                throw new ArgumentException("Only http and https URLs are supported: " + url, "url");
            }
            return request;
        }

        /// <summary>
        /// Applies the user agent, timeout and cookies shared by GET and POST requests.
        /// </summary>
        private static void ApplyCommonSettings(HttpWebRequest request, int? timeout, string userAgent, CookieCollection cookies)
        {
            request.UserAgent = string.IsNullOrEmpty(userAgent) ? DefaultUserAgent : userAgent;

            if (timeout.HasValue)
            {
                request.Timeout = timeout.Value;
            }

            if (cookies != null)
            {
                request.CookieContainer = new CookieContainer();
                request.CookieContainer.Add(cookies);
            }
        }

        /// <summary>
        /// Joins the parameters into a "name=value&amp;name=value" string.
        /// </summary>
        private static string BuildFormBody(IDictionary<string, string> parameters)
        {
            // NOTE(review): names and values are written as-is, without URL-encoding.
            // Callers must pre-encode anything containing '&', '=' or other reserved characters.
            StringBuilder buffer = new StringBuilder();
            foreach (KeyValuePair<string, string> pair in parameters)
            {
                if (buffer.Length > 0)
                {
                    buffer.Append('&');
                }
                buffer.AppendFormat("{0}={1}", pair.Key, pair.Value);
            }
            return buffer.ToString();
        }

        /// <summary>
        /// Certificate validation callback that accepts every certificate.
        /// </summary>
        private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true; // always accept
        }
    }
}