1. 程式人生 > >如何通過URL抓取網站的資料

如何通過URL抓取網站的資料

使用的外部dll

HtmlAgilityPack:用途是將 HTML 原始碼解析成類似 XML 的節點樹,方便之後用 XPath 查詢所需的資料

1.通過 URL 獲取網站原始碼。這裡用的是一個比較簡單的方法,不過方法不在乎簡單與否,能用就行

/// <summary>
/// Sends a GET request to <paramref name="url"/> and returns the whole response body as text.
/// </summary>
/// <param name="url">Address to request; forwarded unchanged to HttpWebResponseUtility.CreateGetHttpResponse.</param>
/// <param name="en">Encoding used to decode the response stream (for example Encoding.Default).</param>
/// <returns>The response body read to the end of the stream.</returns>
public string getWeb(string url, Encoding en)
        {
            // An empty collection is sent with the request; the code that would fill it from
            // response.Headers["Set-Cookie"] is intentionally omitted in this example.
            CookieCollection cookies = new CookieCollection();

            // Dispose the response, its stream and the reader even when reading throws,
            // so the underlying connection is always released.
            using (HttpWebResponse response = HttpWebResponseUtility.CreateGetHttpResponse(url, null, null, cookies))
            using (Stream sm = response.GetResponseStream())
            using (System.IO.StreamReader streamReader = new System.IO.StreamReader(sm, en))
            {
                // Convert the stream into a string.
                return streamReader.ReadToEnd();
            }
        }

2.獲取原始碼之後通過HtmlAgilityPack解析原始碼
/// <summary>
/// Rule class: pairs a match rule with the value extracted by it.
/// </summary>
public class TestWeb
{
    /// <summary>Match rule (the XPath expression used for extraction).</summary>
    public string testMatch { get; set; }

    /// <summary>Returned value (the text extracted from the page).</summary>
    public string test { get; set; }
}

解析原始碼,返回List資料

/// <summary>
/// Downloads a page, selects every table matched by a fixed XPath and extracts one cell
/// (second row, second column) from each table.
/// </summary>
/// <param name="category">Not used by the current implementation; kept so existing callers still compile.</param>
/// <returns>
/// One <see cref="TestWeb"/> per table in which the cell was found, with <c>test</c> set to the
/// cell text (spaces removed). Empty when no table or no cell matches; never null.
/// </returns>
public List<TestWeb> getLegalPage(Category category)
        {
            // Rule applied inside each table fragment: second cell of the second row.
            const string cellXPath = "//tr[2]/td[2]";
            List<TestWeb> list = new List<TestWeb>();

            CreateHtml ch = new CreateHtml("", "");
            HtmlDocument document = new HtmlDocument();
            // Both arguments are placeholders from the article: replace the first with the real URL
            // and the second with the page's actual encoding. getWeb requires an Encoding instance,
            // so a string cannot be passed as the second argument.
            document.LoadHtml(ch.getWeb("url路勁", Encoding.UTF8));

            // Match all data tables.
            HtmlNodeCollection categoryNodeList = document.DocumentNode.SelectNodes("//html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/div[1]/table");
            if (categoryNodeList == null)
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                return list;
            }

            foreach (HtmlNode categoryNode in categoryNodeList)
            {
                // Re-parse the table as a standalone fragment; presumably so the leading "//" in the
                // rule is scoped to this table instead of the whole document - confirm before changing.
                HtmlNode fragment = HtmlNode.CreateNode(categoryNode.OuterHtml);
                HtmlNode cell = fragment.SelectSingleNode(cellXPath);
                if (cell == null)
                {
                    // Tables without the expected cell are skipped, as before, but without
                    // relying on a swallowed NullReferenceException.
                    continue;
                }

                TestWeb item = new TestWeb();
                item.test = cell.InnerText.Replace(" ", "");
                list.Add(item);
            }
            return list;
        }
這樣,一個簡單的通過 URL 獲取資料的方法就完成了。

這裡提供一個http請求輔助類HttpWebResponseUtility(轉載自-----)

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.DirectoryServices.Protocols;
using System.ServiceModel.Security;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;

namespace Common
{
    /// <summary>
    /// Helper class for issuing HTTP GET and POST requests through <see cref="HttpWebRequest"/>.
    /// </summary>
    public class HttpWebResponseUtility
    {
        private static readonly string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

        /// <summary>
        /// Sends an HTTP GET request and returns the response.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="timeout">Request timeout passed to <see cref="HttpWebRequest.Timeout"/>; null keeps the default.</param>
        /// <param name="userAgent">User-Agent header value; when null or empty a built-in default is used.</param>
        /// <param name="cookies">Cookies sent with the request; may be null when none are needed.</param>
        /// <returns>The response; the caller is responsible for closing it.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null or empty.</exception>
        /// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request (for example a file: or ftp: URL).</exception>
        public static HttpWebResponse CreateGetHttpResponse(string url, int? timeout, string userAgent, CookieCollection cookies)
        {
            if (string.IsNullOrEmpty(url))
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = CreateRequest(url, "GET", timeout, userAgent, cookies);
            return (HttpWebResponse)request.GetResponse();
        }

        /// <summary>
        /// Sends an HTTP POST request with a form-style body and returns the response.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="parameters">
        /// Names and values posted as <c>name=value</c> pairs joined with <c>&amp;</c>; may be null or empty,
        /// in which case no body is written. Names and values are written exactly as given - no URL-encoding
        /// is applied - so callers must pre-encode anything containing reserved characters.
        /// </param>
        /// <param name="timeout">Request timeout passed to <see cref="HttpWebRequest.Timeout"/>; null keeps the default.</param>
        /// <param name="userAgent">User-Agent header value; when null or empty a built-in default is used.</param>
        /// <param name="requestEncoding">Encoding used to turn the body text into bytes.</param>
        /// <param name="cookies">Cookies sent with the request; may be null when none are needed.</param>
        /// <returns>The response; the caller is responsible for closing it.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null or empty, or <paramref name="requestEncoding"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request (for example a file: or ftp: URL).</exception>
        public static HttpWebResponse CreatePostHttpResponse(string url, IDictionary<string, string> parameters, int? timeout, string userAgent, Encoding requestEncoding, CookieCollection cookies)
        {
            if (string.IsNullOrEmpty(url))
            {
                throw new ArgumentNullException("url");
            }
            if (requestEncoding == null)
            {
                throw new ArgumentNullException("requestEncoding");
            }

            bool isHttps = url.StartsWith("https", StringComparison.OrdinalIgnoreCase);
            if (isHttps)
            {
                // SECURITY NOTE(review): this installs a process-wide callback that accepts every
                // server certificate, which disables TLS validation for all requests in the process.
                // Kept because existing callers may rely on it; restrict or remove it where possible.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            }

            HttpWebRequest request = CreateRequest(url, "POST", timeout, userAgent, cookies);
            if (isHttps)
            {
                // HTTPS requests are sent as HTTP/1.0.
                request.ProtocolVersion = HttpVersion.Version10;
            }
            request.ContentType = "application/x-www-form-urlencoded";

            // Write a body only when there is data to post.
            if (parameters != null && parameters.Count > 0)
            {
                byte[] data = requestEncoding.GetBytes(BuildFormBody(parameters));
                using (Stream stream = request.GetRequestStream())
                {
                    stream.Write(data, 0, data.Length);
                }
            }
            return (HttpWebResponse)request.GetResponse();
        }

        /// <summary>
        /// Creates an <see cref="HttpWebRequest"/> and applies the settings shared by GET and POST.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url, string method, int? timeout, string userAgent, CookieCollection cookies)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // WebRequest.Create returns other request types for non-HTTP schemes; fail with a
                // clear message instead of a NullReferenceException further down.
                throw new ArgumentException("The URL must use the http or https scheme.", "url");
            }

            request.Method = method;
            request.UserAgent = string.IsNullOrEmpty(userAgent) ? DefaultUserAgent : userAgent;
            if (timeout.HasValue)
            {
                request.Timeout = timeout.Value;
            }
            if (cookies != null)
            {
                request.CookieContainer = new CookieContainer();
                request.CookieContainer.Add(cookies);
            }
            return request;
        }

        /// <summary>
        /// Joins the entries as <c>name=value</c> pairs separated by <c>&amp;</c>, without any escaping.
        /// </summary>
        private static string BuildFormBody(IDictionary<string, string> parameters)
        {
            StringBuilder buffer = new StringBuilder();
            foreach (KeyValuePair<string, string> pair in parameters)
            {
                // Every pair appends at least '=', so a non-empty buffer means a pair precedes this one.
                if (buffer.Length > 0)
                {
                    buffer.Append('&');
                }
                buffer.Append(pair.Key).Append('=').Append(pair.Value);
            }
            return buffer.ToString();
        }

        /// <summary>
        /// Certificate validation callback that accepts every certificate regardless of <paramref name="errors"/>.
        /// </summary>
        private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true; // always accept
        }
    }
}