C#Xpath解析HtmlDocument的使用方法與遞迴取得頁面所有標籤xpath值(附原始碼)
引用:https://www.cnblogs.com/wangchuang/archive/2013/03/11/2953638.html
在學習HTML Xpath之前呢我們先來下載一下Dll檔案
下載地址:http://htmlagilitypack.codeplex.com/
大家單擊如下圖片下載就行了
<ignore_js_op>xpath1.jpg
接下來就是在程式中引用一下,
<ignore_js_op>xpath2.jpg
然後就可以直接呼叫了,大家看看程式碼吧
普通瀏覽複製程式碼
// HtmlDocument is the object used to work with an HTML document.
HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
// Parse the HTML text held in strhtml.
hd.LoadHtml(strhtml);
// SelectSingleNode returns null when nothing matches the XPath,
// so check the result before reading OuterHtml.
HtmlAgilityPack.HtmlNode node = hd.DocumentNode.SelectSingleNode("//*[@id='e_font']");
string str = node != null ? node.OuterHtml : string.Empty;
這樣就可以得到一個標籤的HTml程式碼了
OuterHtml 取的是包含標籤本身的 Html;如果是 InnerHtml,取的就是包含在這個標籤之內的所有 Html 程式碼了
這點大家要注意了
如果大家想獲取Html程式碼的Xpath路徑就是這部分
//*[@id='e_font']
複製程式碼
這個其實很簡單,只要大家安裝一個 Firebug 就行了,
看下圖片
<ignore_js_op>xpath3.jpg
大家只要進入選擇模式,然後選擇你要的內容,然後右鍵複製一下就行了。
然後放在SelectSingleNode()方法裡就OK了
下面我說說幾個方法和屬性的意思吧。
方法
SelectNodes 獲取的是一個集合
SelectSingleNode 獲取一個標籤
SetAttributeValue 設定標籤的屬性值例如:SetAttributeValue("name","xpath-89");這說明把name屬性的值修改為xpath-89
屬性
OuterHtml 是取包含本身的Html
InnerHtml 取的包含在這個標籤之內的所有Html程式碼了
XPath 獲取相對應的Xpath值
Attributes 獲取一個屬性的值例如:Attributes["name"]
也可以進行新增屬性例如:
普通瀏覽複製程式碼
// Look the node up by its XPath first, then attach a custom "xpathid" attribute to it.
var target = hd.DocumentNode.SelectSingleNode(item.Key);
target.Attributes.Add("xpathid", "xpath_1");
下面我寫了一個遞迴獲取Html頁面所有Xpath值的方法大家看一下吧
普通瀏覽複製程式碼
// One entry per discovered node: Key holds the node's XPath, Value is left empty.
public List<ObjXpath> XpathList = new List<ObjXpath>();

// Put the HTML source to analyse here. For one way of downloading it, see the
// HttpHelper class: http://www.sufeinet.com/thread-3-1-1.html
public string strhtml = "";

// Running counter used as the id of the next entry added to XpathList.
private int Index = 0;

/// <summary>
/// Parses <c>strhtml</c> and rebuilds <c>XpathList</c> from scratch.
/// </summary>
private void SartNode()
{
    // HtmlDocument is the object used to work with an HTML document.
    HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
    // Parse the HTML text.
    hd.LoadHtml(strhtml);

    Index = 0;
    XpathList.Clear();

    // Start at the document node itself so that root-level nodes are recorded
    // as well; walking each root's children only would leave the roots out.
    Setxpath(hd.DocumentNode);
}

/// <summary>
/// Recursively walks the DOM below <paramref name="node"/> and records the XPath
/// of every descendant whose XPath does not contain '#'.
/// </summary>
/// <param name="node">The node whose children are to be processed.</param>
private void Setxpath(HtmlNode node)
{
    foreach (HtmlNode item in node.ChildNodes)
    {
        // Read the property once per node instead of three times.
        string xpath = item.XPath;

        // Skip nodes whose XPath contains '#'
        // (presumably HtmlAgilityPack's #text / #comment nodes - TODO confirm).
        if (xpath.Contains("#"))
        {
            continue;
        }

        XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = xpath, Value = "" });
        Index++;

        // Recursing into a node without children is a no-op, so no Count check is needed.
        Setxpath(item);
    }
}

/// <summary>
/// One recorded node: a sequence id, the node's XPath (Key) and a free-form Value.
/// </summary>
public class ObjXpath
{
    public string id { get; set; }
    public string Key { get; set; }
    public string Value { get; set; }
}
XpathList 就是獲取的所有Xpath值了,大家有興趣的話可以試試
我們先來看看效果吧
<ignore_js_op>xpath4.jpg
好了下面放出所有程式碼給大家
普通瀏覽複製程式碼
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using System.Threading;
using HtmlAgilityPack;
using System.IO;
using System.Runtime.Serialization.Json;
namespace AutoXpathTools
{
/// <summary>
/// Main window of the tool: downloads the page whose URL is typed into textBox1,
/// lists the XPath of the nodes found in it, and shows the OuterHtml of the node
/// addressed by the XPath in txtPath.
/// </summary>
public partial class Form1 : Form
{
    public Form1()
    {
        InitializeComponent();
    }

    #region Private fields and delegates
    // Delegate taking one string; used to marshal UIContorol onto the UI thread.
    private delegate void SetListBox(string str);
    // One entry per discovered node: Key holds the XPath, Value is left empty.
    List<ObjXpath> XpathList = new List<ObjXpath>();
    // Running counter used as the id of the next entry added to XpathList.
    private int Index = 0;
    // Document queried by button3_Click. It is only touched from UI event handlers.
    HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
    #endregion

    /// <summary>
    /// Downloads the page, shows its source in txtXml and starts a background
    /// scan that lists the XPath of its nodes.
    /// </summary>
    // NOTE(review): clicking again while a scan is still running starts a second
    // worker that shares XpathList and Index with the first one.
    private void btnGetXpath_Click(object sender, EventArgs e)
    {
        try
        {
            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem() { URL = textBox1.Text.Trim(), IsToLower = false, Encoding = "gbk" };
            txtXml.Text = http.GetHtml(item);

            // Snapshot the text on the UI thread: the worker must not read txtXml.
            string html = txtXml.Text;
            if (!string.IsNullOrWhiteSpace(html)
                && !string.Equals(html.Trim(), "error", StringComparison.OrdinalIgnoreCase))
            {
                // Load the document used later by button3_Click.
                hd.LoadHtml(html);
                // Drop the results of any previous scan so the list does not accumulate.
                listBox1.Items.Clear();

                Thread pingTask = new Thread(new ThreadStart(delegate
                {
                    // Work executed on the background thread.
                    SartNode(html);
                }));
                // Do not keep the process alive if the form is closed mid-scan.
                pingTask.IsBackground = true;
                pingTask.Start();
            }
            else
            {
                txtXml.Text = "根據您的的ULR:" + textBox1.Text.Trim() + "無法得到任何內容";
            }
        }
        catch (Exception ex)
        {
            txtXml.Text = ex.Message.Trim();
        }
    }

    /// <summary>
    /// Parses <paramref name="strhtml"/> and rebuilds XpathList from it.
    /// Called on the worker thread started by btnGetXpath_Click.
    /// </summary>
    /// <param name="strhtml">HTML source to analyse.</param>
    private void SartNode(string strhtml)
    {
        // Use a private document instance here so the worker never shares the
        // field hd with the UI thread.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(strhtml);

        Index = 0;
        XpathList.Clear();

        // Start at the document node itself so that root-level nodes are recorded
        // as well; walking each root's children only would leave the roots out.
        Setxpath(doc.DocumentNode);
    }

    /// <summary>
    /// Recursively walks the DOM below <paramref name="node"/>, recording and
    /// displaying the XPath of every descendant whose XPath does not contain '#'.
    /// </summary>
    /// <param name="node">The node whose children are to be processed.</param>
    private void Setxpath(HtmlNode node)
    {
        foreach (HtmlNode item in node.ChildNodes)
        {
            // Read the property once per node instead of several times.
            string xpath = item.XPath;

            // Skip nodes whose XPath contains '#'
            // (presumably HtmlAgilityPack's #text / #comment nodes - TODO confirm).
            if (xpath.Contains("#"))
            {
                continue;
            }

            XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = xpath, Value = "" });
            UIContorol(xpath);
            Index++;

            // Recursing into a node without children is a no-op, so no Count check is needed.
            Setxpath(item);
        }
    }

    /// <summary>
    /// Appends <paramref name="str"/> to listBox1 and shows it in the status bar.
    /// Safe to call from the worker thread: the call is marshalled to the UI
    /// thread through the SetListBox delegate.
    /// </summary>
    /// <param name="str">Text (an XPath) to display.</param>
    private void UIContorol(string str)
    {
        // Nothing to update once the form has been closed.
        if (IsDisposed)
        {
            return;
        }

        if (InvokeRequired)
        {
            try
            {
                Invoke(new SetListBox(UIContorol), new object[] { str });
            }
            catch (InvalidOperationException)
            {
                // The form was closed between the check above and the call
                // (ObjectDisposedException derives from this type); the update
                // is no longer needed, so it is dropped deliberately.
            }
            return;
        }

        listBox1.Items.Add(str);
        toolStripStatusLabel1.Text = str;
    }

    // Copies the XPath selected in the list into txtPath.
    private void listBox1_SelectedValueChanged(object sender, EventArgs e)
    {
        if (listBox1.SelectedItem != null)
        {
            txtPath.Text = listBox1.SelectedItem.ToString().Trim();
        }
    }

    /// <summary>
    /// Shows the OuterHtml of the node addressed by the XPath in txtPath.
    /// The output is cleared when the path is empty or matches nothing; an
    /// invalid XPath expression shows the parser's message instead of crashing.
    /// </summary>
    private void button3_Click(object sender, EventArgs e)
    {
        string path = txtPath.Text.Trim();
        if (path.Length == 0)
        {
            txtContents.Text = string.Empty;
            return;
        }

        try
        {
            // SelectSingleNode returns null when nothing matches the XPath.
            HtmlNode match = hd.DocumentNode.SelectSingleNode(path);
            txtContents.Text = match != null ? match.OuterHtml : string.Empty;
        }
        catch (System.Xml.XPath.XPathException ex)
        {
            txtContents.Text = ex.Message.Trim();
        }
    }

    // Intentionally empty. The handler is kept because the form may still be
    // wired to it; the commented-out login experiment that used to live here
    // contained account credentials and was removed.
    private void Form1_Load(object sender, EventArgs e)
    {
    }
}
/// <summary>
/// Plain data holder with three string properties: an id, a Key and a Value.
/// </summary>
public class ObjXpath
{
    /// <summary>Identifier of the entry, kept as text.</summary>
    public string id { get; set; }

    /// <summary>Key of the entry.</summary>
    public string Key { get; set; }

    /// <summary>Value associated with the key.</summary>
    public string Value { get; set; }
}
}
就到這裡吧,大家可以下載我的原始碼試試手
打包下載:
<ignore_js_op> AutoXpathTools.zip (76.32 KB, 下載次數: 0)
如果你感覺可以的話就給我推薦一下吧。感謝大家