使用HttpWebRequest和HtmlAgilityPack抓取網頁(無亂碼)
阿新 • • 發佈:2018-11-28
/// <summary>
/// Issues an HTTP GET for <paramref name="url"/> and returns the response body as text.
/// A gzip- or deflate-encoded body is decompressed before decoding, and the text encoding
/// is chosen from the response's declared character set via GetEncoding.
/// </summary>
/// <param name="url">Absolute http/https URL to download.</param>
/// <returns>The decoded response body.</returns>
/// <exception cref="System.ArgumentException">
/// <paramref name="url"/> does not resolve to an HTTP(S) request.
/// </exception>
public string HttpGet(string url)
{
    HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
    if (req == null)
    {
        // Non-HTTP schemes produce a different WebRequest subtype; fail with a clear
        // message instead of a NullReferenceException on the first property access.
        throw new System.ArgumentException("Only http/https URLs are supported.", "url");
    }

    req.Accept = "*/*";
    req.Method = "GET";
    req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";

    using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
    {
        // Invariant lower-casing: a culture-sensitive ToLower() can fail to match
        // "GZIP" against "gzip" under cultures such as tr-TR.
        string contentEncoding = (response.ContentEncoding ?? string.Empty).ToLowerInvariant();

        Stream stream = response.GetResponseStream();
        if (contentEncoding.Contains("gzip"))
        {
            stream = new GZipStream(stream, CompressionMode.Decompress);
        }
        else if (contentEncoding.Contains("deflate"))
        {
            stream = new DeflateStream(stream, CompressionMode.Decompress);
        }

        // The stream gets its own using so it is released even if creating the
        // reader (or resolving the encoding) throws.
        using (stream)
        using (StreamReader reader = new StreamReader(stream, GetEncoding(response.CharacterSet)))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Maps a character-set name (as reported by an HTTP response) to an <see cref="Encoding"/>.
/// </summary>
/// <param name="CharacterSet">
/// Charset name such as "utf-8" or "gb2312". Matching ignores letter case, surrounding
/// whitespace and surrounding quotes. May be null or empty.
/// </param>
/// <returns>
/// The gb2312 encoding for "gb2312", <see cref="Encoding.UTF8"/> for "utf-8",
/// and <see cref="Encoding.Default"/> for anything else (including null/empty).
/// </returns>
public Encoding GetEncoding(string CharacterSet)
{
    if (string.IsNullOrEmpty(CharacterSet))
    {
        return Encoding.Default;
    }

    // Servers commonly send "UTF-8" / "GB2312" in upper case, sometimes quoted;
    // normalize so those values do not fall through to the system default code page.
    string normalized = CharacterSet.Trim().Trim('"', '\'').ToLowerInvariant();

    switch (normalized)
    {
        case "gb2312":
            return Encoding.GetEncoding("gb2312");
        case "utf-8":
            return Encoding.UTF8;
        default:
            return Encoding.Default;
    }
}
呼叫 HttpGet 即可取得該網址的 HTML 原始碼;取得原始碼後,再用 HtmlAgilityPack 來解析 HTML。
// Download the cnblogs home page and parse it with HtmlAgilityPack.
string html = HttpGet("http://www.cnblogs.com/");
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);

// Get the article list. SelectNodes yields null (not an empty collection) when
// nothing matches, so guard before iterating.
var artlist = doc.DocumentNode.SelectNodes("//div[@class='post_item']");
if (artlist != null)
{
    foreach (var item in artlist)
    {
        // A relative XPath (leading '.') searches only inside this post item,
        // so the fragment does not need to be re-parsed into a new document.
        var html_a = item.SelectSingleNode(".//a[@class='titlelnk']");
        if (html_a == null)
        {
            // Post item without a title link: skip it instead of throwing.
            continue;
        }

        // GetAttributeValue returns the supplied default when href is absent.
        Response.Write(string.Format("標題為:{0},連結為:{1}<br>", html_a.InnerText, html_a.GetAttributeValue("href", "")));
    }
}