.NET實現網路爬蟲
阿新 • • 發佈:2019-01-22
爬蟲的特徵和執行方式
User-Agent:主要用來將我們的爬蟲偽裝成瀏覽器。
Cookie:主要用來儲存爬蟲的登入狀態。
連線數:主要用來限制單臺機器與服務端的連線數量。
代理IP:主要用來偽裝請求地址,提高單機併發數量。
爬蟲工作的方式可以歸納為兩種:深度優先、廣度優先。
深度優先就是一個連線一個連線地向內爬,處理完成後再換下一個連線,這種方式對於我們來說缺點很明顯。
廣度優先就是一層一層的處理,非常適合利用多執行緒併發技術來高效處理,因此我們也用廣度優先的抓取方式。
首先我們用Visual Studio 2015建立一個控制檯程式,定義一個簡單的SimpleCrawler類,裡面只包含幾個簡單的事件:
/// <summary>
/// A minimal crawler type that exposes start / completed / error events
/// and a cookie container shared by the requests it issues.
/// </summary>
public class SimpleCrawler
{
    /// <summary>
    /// Creates a crawler with an empty cookie container, so cookies can be
    /// retained between requests without any extra setup by the caller.
    /// Callers may still replace the container through the property.
    /// </summary>
    public SimpleCrawler()
    {
        this.CookieContainer = new CookieContainer();
    }

    /// <summary>
    /// Crawler start event.
    /// </summary>
    public event EventHandler<OnStartEventArgs> OnStart;

    /// <summary>
    /// Crawler completed event.
    /// </summary>
    public event EventHandler<OnCompletedEventArgs> OnCompleted;

    /// <summary>
    /// Crawler error event; the event argument is the exception itself.
    /// </summary>
    public event EventHandler<Exception> OnError;

    /// <summary>
    /// Cookie container used by the crawler. Never null after construction
    /// unless a caller explicitly assigns null.
    /// </summary>
    public CookieContainer CookieContainer { get; set; }
}
接著我們建立OnStart事件所使用的引數類別OnStartEventArgs,用來攜帶爬蟲即將抓取的Uri:
然後我們建立OnCompleted事件所使用的引數類別OnCompletedEventArgs,用來攜帶抓取的Uri、執行緒ID(ThreadID)、耗時毫秒數(Milliseconds)和頁面原始碼(PageSource):
最後我們再給它增加一個非同步方法,通過User-Agent將爬蟲偽裝成Chrome瀏覽器:
/// <summary>
/// Fetches <paramref name="uri"/> with an HTTP GET on a thread-pool thread
/// and returns the response body as text.
/// </summary>
/// <remarks>
/// <see cref="OnStart"/> is raised before the request is built and
/// <see cref="OnCompleted"/> after the body has been read. Any exception
/// thrown while requesting, reading, or inside an OnStart / OnCompleted
/// handler is passed to <see cref="OnError"/> instead of propagating; if no
/// OnError handler is attached the exception is dropped.
/// The synchronous <c>GetResponse</c> call is kept inside <c>Task.Run</c>
/// on purpose: <c>HttpWebRequest.Timeout</c> only applies to the synchronous API.
/// </remarks>
/// <param name="uri">Address to request.</param>
/// <param name="proxy">Optional proxy; when null, the request's proxy is left unchanged.</param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when the failure
/// happened before the body was read.
/// </returns>
public async Task<string> Start(Uri uri, WebProxy proxy = null)
{
    return await Task.Run(() =>
    {
        var pageSource = string.Empty;
        try
        {
            // Copy the delegate first so an unsubscribe on another thread
            // cannot null it between the check and the call.
            var startHandler = this.OnStart;
            if (startHandler != null)
            {
                startHandler(this, new OnStartEventArgs(uri));
            }

            Stopwatch watch = Stopwatch.StartNew();

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.Accept = "*/*";
            // Content type declared on the request.
            request.ContentType = "application/x-www-form-urlencoded";
            // Do not follow redirects automatically.
            request.AllowAutoRedirect = false;
            // Present the crawler as a Chrome browser.
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36";
            // Request timeout: 5 seconds.
            request.Timeout = 5000;
            // Keep the connection alive.
            request.KeepAlive = true;
            request.Method = "GET";

            // Route the request through the given proxy, if any.
            if (proxy != null)
            {
                request.Proxy = proxy;
            }

            // Read the property once so the request and the loop below
            // are guaranteed to use the same container instance.
            CookieContainer cookieJar = this.CookieContainer;
            request.CookieContainer = cookieJar;

            // Lift the per-endpoint connection limit.
            request.ServicePoint.ConnectionLimit = int.MaxValue;

            // 'using' guarantees the response, stream and reader are released
            // even when reading throws (e.g. on a timeout mid-download).
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                // Store response cookies in the container to keep the session.
                if (cookieJar != null)
                {
                    foreach (Cookie cookie in response.Cookies)
                    {
                        cookieJar.Add(cookie);
                    }
                }

                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    pageSource = reader.ReadToEnd();
                    watch.Stop();
                }
            }

            // Managed id of the thread that ran this request.
            var threadID = Thread.CurrentThread.ManagedThreadId;
            // Elapsed time from just after OnStart until the body was read.
            var milliseconds = watch.ElapsedMilliseconds;

            var completedHandler = this.OnCompleted;
            if (completedHandler != null)
            {
                completedHandler(this, new OnCompletedEventArgs(uri, threadID, milliseconds, pageSource));
            }
        }
        catch (Exception ex)
        {
            var errorHandler = this.OnError;
            if (errorHandler != null)
            {
                errorHandler(this, ex);
            }
        }
        return pageSource;
    });
}
在控制檯裡寫下爬蟲的抓取程式碼:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace TestPa
{
/// <summary>
/// Console driver: crawls the city list page, extracts every hotel link
/// from it, and prints the results.
/// </summary>
class Program
{
    /// <summary>
    /// Wires up the crawler events, runs one crawl of the entry URL and
    /// waits for a key press before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Entry URL of the crawl.
        var cityUrl = "http://hotels.ctrip.com/citylist";
        // Cities discovered so far (name plus hotel-list address), without duplicates.
        var cityList = new List<City>();
        // The crawler defined earlier.
        var cityCrawler = new SimpleCrawler();

        cityCrawler.OnStart += (s, e) =>
        {
            Console.WriteLine("爬蟲開始抓取的地址:" + e.Uri.ToString());
        };

        cityCrawler.OnError += (s, e) =>
        {
            Console.WriteLine("爬蟲抓取出現錯誤:" + e.Message);
        };

        cityCrawler.OnCompleted += (s, e) =>
        {
            var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
            foreach (Match match in links)
            {
                // Skip a malformed href instead of throwing, which would
                // abort the whole loop and lose the remaining links.
                Uri cityUri;
                if (!Uri.TryCreate("http://hotels.ctrip.com" + match.Groups["href"].Value, UriKind.Absolute, out cityUri))
                {
                    continue;
                }

                var city = new City
                {
                    CityName = match.Groups["text"].Value,
                    Uri = cityUri
                };

                // Compare by content: 'city' is a brand-new object, so a
                // reference-based Contains() would never find a duplicate.
                bool alreadyListed = cityList.Any(c => c.Uri == city.Uri && c.CityName == city.CityName);
                if (!alreadyListed)
                {
                    cityList.Add(city);
                }

                Console.WriteLine(city.CityName + "||" + city.Uri);
            }

            Console.WriteLine(e.PageSource);
            Console.WriteLine("**********************************");
            Console.WriteLine("爬蟲抓取完成");
            Console.WriteLine("耗時:" + e.Milliseconds + " 毫秒");
            Console.WriteLine("執行緒:" + e.ThreadID);
            Console.WriteLine("地址:" + e.Uri.ToString());
        };

        // Main is synchronous, so block here until the crawl task finishes.
        cityCrawler.Start(new Uri(cityUrl)).Wait();
        Console.ReadKey();
    }
}
/// <summary>
/// A city entry: its display name and the address it links to.
/// Two instances are equal when both the name and the address match, so
/// collection lookups such as <c>List&lt;City&gt;.Contains</c> detect duplicates.
/// </summary>
public class City : IEquatable<City>
{
    /// <summary>Display name of the city.</summary>
    public string CityName { get; set; }

    /// <summary>Address associated with the city.</summary>
    public Uri Uri { get; set; }

    /// <summary>
    /// Value equality: ordinal comparison of the name plus URI equality.
    /// </summary>
    /// <param name="other">Instance to compare with; may be null.</param>
    /// <returns>True when both properties match.</returns>
    public bool Equals(City other)
    {
        if (ReferenceEquals(other, null))
        {
            return false;
        }
        if (ReferenceEquals(this, other))
        {
            return true;
        }
        return string.Equals(this.CityName, other.CityName, StringComparison.Ordinal)
            && this.Uri == other.Uri;
    }

    /// <inheritdoc/>
    public override bool Equals(object obj)
    {
        return this.Equals(obj as City);
    }

    /// <summary>
    /// Hash consistent with <see cref="Equals(City)"/>. The properties are
    /// mutable, so do not change them while the instance is used as a key
    /// in a hash-based collection.
    /// </summary>
    public override int GetHashCode()
    {
        unchecked
        {
            int hash = 17;
            hash = hash * 31 + (this.CityName != null ? this.CityName.GetHashCode() : 0);
            hash = hash * 31 + (this.Uri != null ? this.Uri.GetHashCode() : 0);
            return hash;
        }
    }
}
}
執行結果: