C# 百度搜索結果xpath分析

阿新 • • 發佈：2017-05-24

als 接收數據 har rim resp inner ets webclient containe

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
namespace xpathGet
{
    class Program
    {
        #region      webclient創建

        /// <summary>
        /// A <see cref="System.Net.WebClient"/> that does not follow HTTP redirects,
        /// sends a desktop Chrome User-Agent, transparently decompresses gzip/deflate
        /// bodies, and exposes the response of the most recent request through
        /// <see cref="Response"/> so callers can inspect its status code and headers.
        /// </summary>
        public class WebClientBD : System.Net.WebClient
        {
            /// <summary>
            /// Creates the request for <paramref name="address"/> and, when it is an HTTP(S)
            /// request, disables automatic redirects and sets decompression and User-Agent.
            /// </summary>
            /// <param name="address">The resource being requested.</param>
            /// <returns>The configured request; non-HTTP requests are returned unchanged.</returns>
            protected override System.Net.WebRequest GetWebRequest(Uri address)
            {
                WebRequest baseRequest = base.GetWebRequest(address);
                HttpWebRequest request = baseRequest as HttpWebRequest;
                if (request == null)
                {
                    // Not an HTTP request: there is nothing HTTP-specific to configure,
                    // and dereferencing the failed cast would throw.
                    return baseRequest;
                }

                // Redirects are left to the caller, which reads the Location header itself.
                request.AllowAutoRedirect = false;
                request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
                return request;
            }

            /// <summary>
            /// The response produced by the most recent request made through this client,
            /// or null if no request has produced one yet.
            /// </summary>
            public WebResponse Response { get; private set; }

            /// <summary>
            /// Obtains the response for <paramref name="request"/> and records it in
            /// <see cref="Response"/>.
            /// </summary>
            /// <param name="request">The request created by <see cref="GetWebRequest"/>.</param>
            /// <returns>The response, including a protocol-error response when the server sent one.</returns>
            /// <exception cref="WebException">The request failed without any server response.</exception>
            protected override WebResponse GetWebResponse(WebRequest request)
            {
                // Never hand back the response of an earlier request if this one fails.
                this.Response = null;
                try
                {
                    this.Response = base.GetWebResponse(request);
                }
                catch (WebException ex)
                {
                    // A protocol error still carries the server's response; keep it so the
                    // caller can inspect the status code. Without a response there is
                    // nothing useful to return, so let the failure surface.
                    if (ex.Response == null)
                    {
                        throw;
                    }
                    this.Response = ex.Response;
                }
                return this.Response;
            }

        }
        /// <summary>
        /// Follows up to two HTTP 302 redirects starting at <paramref name="url"/> and
        /// reports where they lead.
        /// </summary>
        /// <param name="url">The address to request first.</param>
        /// <returns>
        /// Null when the first response is not a 302. Otherwise: the second hop's Location
        /// header when the first redirect target itself answers 302, the first redirect
        /// target when it answers 200, and null for any other status. On any failure the
        /// string "error:" followed by the exception message is returned instead of throwing.
        /// </returns>
        public static string lastUrl(string url)
        {
            try
            {
                string location;
                if (ProbeUrl(url, out location) != HttpStatusCode.Found)
                {
                    return null;
                }
                if (string.IsNullOrEmpty(location))
                {
                    throw new InvalidOperationException("The 302 response carried no Location header.");
                }

                // A Location that does not start with "http" is treated as relative to the Baidu host.
                string redirectLocal = location.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                    ? location
                    : "http://www.baidu.com" + location;

                string nextLocation;
                HttpStatusCode status = ProbeUrl(redirectLocal, out nextLocation);
                if (status == HttpStatusCode.Found)
                {
                    return nextLocation;
                }
                if (status == HttpStatusCode.OK)
                {
                    return redirectLocal;
                }
                return null;
            }
            catch (Exception ex)
            {
                // Callers receive failures as a marker string rather than an exception.
                return "error:" + ex.Message;
            }
        }

        /// <summary>
        /// Requests <paramref name="url"/> without following redirects and reports the
        /// status code and Location header of the response.
        /// </summary>
        private static HttpStatusCode ProbeUrl(string url, out string location)
        {
            using (WebClientBD wc = new WebClientBD())
            {
                wc.Credentials = CredentialCache.DefaultCredentials;
                // The body is not needed; the download is only what triggers the request.
                wc.DownloadData(url);

                HttpWebResponse response = wc.Response as HttpWebResponse;
                if (response == null)
                {
                    throw new InvalidOperationException("No HTTP response was received for " + url);
                }
                location = response.Headers["location"];
                return response.StatusCode;
            }
        }
        #endregion
        /// <summary>
        /// Downloads <paramref name="url"/> with a <see cref="WebClientBD"/> and decodes
        /// the response body as UTF-8.
        /// </summary>
        /// <param name="url">The address of the page to download.</param>
        /// <returns>The response body decoded as UTF-8 text.</returns>
        public static string GetHtmlSource(string url)
        {
            // Dispose the client once the body has been read so it does not leak.
            using (WebClientBD wc = new WebClientBD())
            {
                wc.Credentials = CredentialCache.DefaultCredentials;
                // If the text comes out garbled, the page may use another encoding such as GB2312.
                byte[] pageData = wc.DownloadData(url);
                return Encoding.UTF8.GetString(pageData);
            }
        }

        /// <summary>
        /// Downloads the first five Baidu result pages for a fixed keyword, saves each raw
        /// page to d:\infolist_PageN.html, extracts title, link, abstract and display URL
        /// for the regular results and for the blocks this program treats as ads, and writes
        /// them as "title|link|content|cite" rows to d:\infolist.txt and d:\infolist2.txt.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            // {0} = keyword, {1} = zero-based page index; the literal 0 after {1} makes pn = index * 10.
            const string address = "http://www.baidu.com/s?wd={0}&pn={1}0&oq={0}&ie=utf-8&usm=4";
            // NOTE(review): the keyword and the marker texts below are Traditional Chinese in this
            // copy of the code; confirm they match what Baidu actually serves.
            const string key = "代購";
            const string newInfoMarker = "的最新相關信息";
            const int pageCount = 5;

            // Containers under #content_left whose id contains 300 or 400 are treated as ads;
            // {0}/{1} receive the relative path inside each kind of container.
            const string adXPath = "//div[@id='content_left']//div[contains(@id,'300')]{0}|//div[@id='content_left']//div[contains(@id,'400')]{1}";
            // Regular results are the c-container blocks; {0} receives the relative path.
            const string organicXPath = "//div[@id='content_left']//div[contains(@class,'c-container')]{0}";

            List<string> organicRows = new List<string>();
            List<string> adRows = new List<string>();

            // Encode once so keywords containing '&', '#', spaces etc. cannot break the query string.
            string encodedKey = Uri.EscapeDataString(key);

            for (int pnIndex = 0; pnIndex < pageCount; pnIndex++)
            {
                string htmlPageSource = GetHtmlSource(string.Format(address, encodedKey, pnIndex));
                // Keep the raw page so the extraction can be checked against it later.
                File.WriteAllText(@"d:\infolist_Page" + (pnIndex + 1) + ".html", htmlPageSource, Encoding.UTF8);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlPageSource);

                // ----- regular results -----
                // The title anchor supplies both the title text and the link.
                List<HtmlNode> titleNodes = SelectNodesOrEmpty(doc, string.Format(organicXPath, "/h3/a[1]"));
                List<HtmlNode> contentNodes = SelectNodesOrEmpty(doc, string.Join("|", new[]
                {
                    string.Format(organicXPath, "//div[@class='c-abstract']"),
                    string.Format(organicXPath, "//div[@class='c-span18 c-span-last']/p[1]"),
                    string.Format(organicXPath, "//div[@class='c-offset']"),
                    string.Format(organicXPath, "//div[@class='op_dict_content']"),
                    string.Format(organicXPath, "//p[contains(text(),'由於該網站的robots.txt文件存在限制指令')]")
                }));
                List<HtmlNode> citeNodes = SelectNodesOrEmpty(doc,
                    string.Format(organicXPath, "//span[@class='c-showurl']") + "|" + string.Format(organicXPath, "//a[@class='c-showurl']"));

                List<string> cites = new List<string>();
                foreach (HtmlNode node in citeNodes)
                {
                    cites.Add(CleanText(node.InnerText));
                }

                List<string> contents = new List<string>();
                foreach (HtmlNode node in contentNodes)
                {
                    contents.Add(CleanText(node.InnerText));
                }

                List<string> titles = new List<string>();
                List<string> links = new List<string>();
                foreach (HtmlNode node in titleNodes)
                {
                    titles.Add(CleanText(node.InnerText));
                    links.Add(CleanText(node.GetAttributeValue("href", "")));
                }

                // Every result has a title, but the "latest related info" result has no cite,
                // so the cite list can be one short (e.g. 9 titles, 8 cites). Put a "new info"
                // placeholder at that result's position to keep titles and cites aligned.
                if (titles.Count != cites.Count)
                {
                    if (cites.Count < titles.Count && titleNodes[cites.Count].InnerText.Contains(newInfoMarker))
                    {
                        // The cite-less result is the first one past the cites found: append.
                        cites.Add("new info");
                    }
                    else
                    {
                        for (int i = 0; i < titles.Count; i++)
                        {
                            if (titles[i].Contains(newInfoMarker))
                            {
                                // Clamp so an insert position past the end cannot throw.
                                cites.Insert(Math.Min(i, cites.Count), "new info");
                            }
                        }
                    }
                }

                for (int j = 0; j < titles.Count; j++)
                {
                    // A missing content/cite becomes an empty field instead of an out-of-range crash.
                    organicRows.Add(titles[j] + "|" + links[j] + "|" + ItemOrEmpty(contents, j) + "|" + ItemOrEmpty(cites, j) + "\t");
                }
                organicRows.Add(String.Format("以上為第{0}頁搜索結果。", pnIndex + 1));

                // ----- ad results -----
                List<HtmlNode> adTitleNodes = SelectNodesOrEmpty(doc, string.Format(adXPath, "/div[1]/h3/a[1]", "/div[1]/h3/a[1]"));
                List<HtmlNode> adContentNodes = SelectNodesOrEmpty(doc, string.Format(adXPath, "/div[2]", "/div[2]"));
                List<HtmlNode> adCiteNodes = SelectNodesOrEmpty(doc, string.Format(adXPath, "/div[2]//a/span[1]", "/div[3]/a/span"));

                for (int i = 0; i < adTitleNodes.Count; i++)
                {
                    string title = CleanText(adTitleNodes[i].InnerText);
                    string link = CleanText(adTitleNodes[i].GetAttributeValue("href", ""));
                    string content = i < adContentNodes.Count ? CleanText(adContentNodes[i].InnerText) : string.Empty;
                    string cite = i < adCiteNodes.Count ? CleanText(adCiteNodes[i].InnerText) : string.Empty;
                    adRows.Add(title + "|" + link + "|" + content + "|" + cite + "\t");
                }
                adRows.Add(String.Format("以上為第{0}頁搜索結果。", pnIndex + 1));
            }

            File.WriteAllLines(@"d:\infolist.txt", organicRows.ToArray(), Encoding.UTF8);
            File.WriteAllLines(@"d:\infolist2.txt", adRows.ToArray(), Encoding.UTF8);
        }

        /// <summary>
        /// Runs an XPath query and returns the matches as a list; SelectNodes yields null
        /// when nothing matches, which is mapped to an empty list here.
        /// </summary>
        private static List<HtmlNode> SelectNodesOrEmpty(HtmlDocument doc, string xpath)
        {
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
            return nodes == null ? new List<HtmlNode>() : new List<HtmlNode>(nodes);
        }

        /// <summary>Trims the text and removes all spaces and line feeds from it.</summary>
        private static string CleanText(string text)
        {
            // NOTE(review): the original chain removed " " twice; the second pattern may have
            // been a different character (e.g. a non-breaking space) before publication - confirm.
            return text.Trim().Replace(" ", String.Empty).Replace("\n", string.Empty);
        }

        /// <summary>Returns items[index], or an empty string when the index is past the end.</summary>
        private static string ItemOrEmpty(List<string> items, int index)
        {
            return index < items.Count ? items[index] : string.Empty;
        }
    }
}

C# 百度搜索結果xpath分析

als 接收數據 har rim resp inner ets webclient containe using System; using System.Collections.Generic; using System.IO; using System.Linq; u

百度搜索結果HTML分析

lpar 查找需求搜索結果格式化工具 all AI tom www 目的：為了從搜索結果中提取所有網頁，以備後續處理。訪問百度鏈接分析名稱值說明 wd 任意文字關鍵字 rn 可以不指定，默認為10，最大為50，最小為1，可設置為任意值一頁包

PHP網路爬蟲實踐：抓取百度搜索結果，並分析資料結構

百度的搜尋引擎有反爬蟲機制，我先直接用guzzle試試水。程式碼如下： <?php /** * Created by Benjiemin * Date: 2020/3/5 * Time: 14:58 */ require ('./vendor/autoload.php'); use QL\Qu

python爬取百度搜索結果url匯總

百度搜索 sta attr amp end rom range 百度篩選寫了兩篇之後，我覺得關於爬蟲，重點還是分析過程分析些什麽呢： 1）首先明確自己要爬取的目標　　比如這次我們需要爬取的是使用百度搜索之後所有出來的url結果 2）分析手動進行的獲取目標的過程，以便

利用百度搜索結果爬取郵箱

.... sheet pro 編輯部 pic exception exc gecko 正則表達幫同學做一個關於爬取教授郵箱的任務，在百度搜索中輸入教授的名字+長江學者+郵箱，爬取並篩選每個教授的郵箱，最後把郵箱信息寫入到Excel表中：--爬取結果爭取率大概在50%-60

selenium-webdriver循環點擊百度搜索結果以及獲取新頁面的handler

pre Coding 之前 ref port 圖片自動化測試自動頁面　　webdriver還是很有意思的，之前用過Ruby的watir的自動化測試框架，感覺selenium的這套框架更好一些，很容易就可以上手。我雖然不做自動化這塊，不過先玩玩再說，多學點東西

python3 學習2（分頁翻看百度搜索結果）

# -*- coding: utf-8 -*- from selenium import webdriver import time if __name__ == "__main__": driver = webdriver.Chrome()

如何讓百度搜索結果顯示網站 logo

很多人都有用百度搜索自己想要的東西，例如想學習做網站的人會在百度上搜索“學做網站”，從而獲得符合自己需要的內容。我們在使用百度搜索結果看到，有的搜尋結果有一張LOGO圖片，如上圖，而有的卻沒有這張圖片。在搜尋結果中顯示站點LOGO，可以有效的提高使用者的點選率，對網站品牌的建設更有利。那

HttpClient 實現爬取百度搜索結果（自動翻頁）

如果你對HttpClient還不是很瞭解，建議先移步我的另一篇部落格HttpClient4.x之請求示例後再來看這篇部落格。我們這裡的專案採用maven搭建。在閱讀前要對jdk和maven有一定的瞭解。另外開發工具這裡我這裡使用的是：Spring Tool Suite（STS）當然你也可以使用其

對百度搜索法的分析評價

使用者介面：好處：它會為你推薦實時的熱點和很多功能按鈕，比如新聞，天氣，地圖等。手機版還可以語音搜尋和圖片搜尋。還有網頁導航可以帶你去很多自己感興趣的網頁。壞處：當你開啟一個熱點時，會有特別多的廣告，廣告還有強制你點開，或者關不掉，也有很多的標題黨與事實不符，和搜狗相比，百度更加嚴肅，

百度搜索結果爬蟲

程式碼如下 import requests from lxml import etree # 抓取整個頁面 words = input("輸入搜尋內容：") headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64;

PHP多程序抓取百度搜索結果

<?php /** * 多程序抓取百度結果頁自然結果，包括標題、摘要、圖片、連結、來源 * @since 2016-04-15 */ class NaturalResultSpider { private $_strQuery = null; pub

python3爬取百度搜索結果

前不久為了在群裡鬥圖，想多蒐集點表情包學習了一下python爬蟲，蒐集了一萬多張吧。下載太多，完全不知道有什麼圖，還是鬥不過！！！！！今天又想爬取百度的搜尋結果，本人還是小白，怕忘記記錄一下，望大神賜教指正同樣是以爬取圖片為例，還很簡陋，沒什麼實用價值

如何刪除百度搜索結果_如何刪除百度快照

　不想讓網民看到的資訊如何從百度刪除呢？即如何刪除百度搜索結果，不止網站管理員關心，很多普通的網民也非常關注。今天有請優就業SEO研究院院長吳秀龍給大家分享一下刪除百度搜索結果的方法，裡面有需要大家注意的細節哦。　　百度一下，發現搜尋結果中有涉及個人隱私、企業商業機密或其它不

百度搜索結果屏蔽百家號方法

blog color log 方法 tps 百度搜索 water term nag 在搜索欄輸入關鍵字 + '-baijiahao'，即可屏蔽百家號內容，如：百度搜索結果屏蔽百家號方法

Python+selenium+PhantomJS獲取百度搜索結果真實連結地址

百度搜索結果如何屏蔽百家號內容

image put set amp www 彈出網址 cheng inf 瀏覽器訪問chrome://settings/searchEngines頁面我用的是360極速瀏覽器，彈出以下頁面把默認的百度搜索網址改成 https://www.baidu.com/#ie=

python采集百度搜索結果帶有特定URL的鏈接

desc while __init__ self. stat [] 百度 __main__ odin #coding utf-8 import requests from bs4 import BeautifulSoup as bs import re from

【數據分析】python分析百度搜索關鍵詞的頻率

爬蟲自動化數據分析 python 基礎涉及知識點 1、抓取數據 2、分頁爬蟲規律分析1、抓取數據，發現每一項都是data-tools標簽2、分頁分析代碼import requests from bs4 import BeautifulSoup import re impo

百度搜索法的評價分析

使用者介面好處：現代有一句話叫有問題問度娘，足以見出百度現在在人們心中的地位，在百度中可以查到大部分你想要的，並且實時更新資料，並推薦實時熱點給你壞處：有時候推薦的東西沒有營養，廣告應該提高門檻記住使用者選擇好處：記住你所感興趣的，不用你再一次搜尋，這樣可以

C# 百度搜索結果xpath分析

相關推薦