1. 程式人生 > 實用技巧 >爬蟲系列 一次採集.NET WebForm網站的坎坷歷程

爬蟲系列 一次採集.NET WebForm網站的坎坷歷程

/// <summary>
/// Sends an HTTP GET request to <paramref name="url"/> using the shared cookie
/// container and returns the response body as text.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="SuccessCallback">
/// Optional. Invoked with the response body when the body does not contain one of
/// the login-page markers.
/// </param>
/// <param name="FailCallback">
/// Optional. Invoked with the response body when the body contains one of the
/// login-page markers (the same markers that Post checks).
/// </param>
/// <returns>The full response body.</returns>
public static string Get(string url, Action<string> SuccessCallback, Action<string> FailCallback)
{
    // Direct cast: a non-HTTP URL fails here with InvalidCastException instead of
    // a NullReferenceException on the first property access.
    var req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET";
    req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
    req.Accept = "*/*";
    req.KeepAlive = true;
    req.ServicePoint.ConnectionLimit = int.MaxValue;
    req.ServicePoint.Expect100Continue = false;
    req.CookieContainer = sznyCookie; // static field; Post uses the same container
    req.Credentials = System.Net.CredentialCache.DefaultCredentials;

    string msg;
    using (var rsp = (HttpWebResponse)req.GetResponse())
    using (var reader = new StreamReader(rsp.GetResponseStream()))
    {
        msg = reader.ReadToEnd();
    }

    // Report the outcome through the callbacks, using the same markers as Post.
    bool loginRequired = msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx");
    if (loginRequired)
    {
        FailCallback?.Invoke(msg);
    }
    else
    {
        SuccessCallback?.Invoke(msg);
    }

    return msg;
}
 
    /// <summary>
    /// Sends <paramref name="dicParms"/> as a URL-encoded form POST to
    /// <paramref name="url"/> using the shared cookie container, and returns the
    /// response body as text.
    /// </summary>
    /// <param name="url">Address to post to; also sent as the Referer header.</param>
    /// <param name="dicParms">
    /// Form fields. Entries whose key starts with "header" are left out of the body.
    /// When the key "ScriptManager1" is present, the request is sent with the
    /// X-MicrosoftAjax and X-Requested-With headers.
    /// </param>
    /// <param name="SuccessCallback">
    /// Optional. Invoked with the response body when it contains none of the
    /// login-page markers.
    /// </param>
    /// <param name="FailCallback">
    /// Optional. Invoked with the response body when it contains a login-page marker.
    /// </param>
    /// <returns>The full response body.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dicParms"/> is null.</exception>
    public static string Post(string url, Dictionary<string, string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback)
    {
        if (dicParms == null)
        {
            throw new ArgumentNullException(nameof(dicParms));
        }

        // Build the application/x-www-form-urlencoded body.
        var data = new StringBuilder();
        foreach (var kv in dicParms)
        {
            // Ordinal comparison: the key prefix is an identifier, not linguistic text.
            if (kv.Key.StartsWith("header", StringComparison.Ordinal))
            {
                continue;
            }

            if (data.Length > 0)
            {
                data.Append('&');
            }

            data.Append(Common.UrlEncode(kv.Key, Encoding.UTF8))
                .Append('=')
                .Append(Common.UrlEncode(kv.Value, Encoding.UTF8));
        }

        // Direct cast: a non-HTTP URL fails here with InvalidCastException instead of
        // a NullReferenceException on the first property access.
        var req = (HttpWebRequest)WebRequest.Create(url);
        req.Method = "POST";
        // The Connection header is not set by hand: KeepAlive = true already makes
        // the request send the keep-alive token.
        req.KeepAlive = true;
        req.CookieContainer = sznyCookie;
        req.ContentType = "application/x-www-form-urlencoded";
        req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
        req.Referer = url;

        if (dicParms.ContainsKey("ScriptManager1"))
        {
            // Send the request the way an ASP.NET AJAX partial postback is sent.
            req.Headers.Add("X-MicrosoftAjax", "Delta=true");
            req.Headers.Add("X-Requested-With", "XMLHttpRequest");
            req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
            req.Accept = "*/*";
        }

        req.Headers.Add("Cache-Control", "no-cache");
        req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
        req.ServicePoint.ConnectionLimit = int.MaxValue;
        req.ServicePoint.Expect100Continue = false;
        req.AllowAutoRedirect = true;
        req.Credentials = System.Net.CredentialCache.DefaultCredentials;

        byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
        using (Stream reqStream = req.GetRequestStream())
        {
            reqStream.Write(buffer, 0, buffer.Length);
        }

        string msg;
        using (var rsp = (HttpWebResponse)req.GetResponse())
        using (var reader = new StreamReader(rsp.GetResponseStream()))
        {
            msg = reader.ReadToEnd();

            bool loginRequired = msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx");
            if (loginRequired)
            {
                // Login failed
                FailCallback?.Invoke(msg);
            }
            else
            {
                SuccessCallback?.Invoke(msg);
            }
        }

        return msg;
    }

整個過程分為登入、使用者資訊列表、使用者資訊詳情,分三步走來完成這個專案。

登入

根據 Chrome 抓包結果編寫 Login。帳號密碼沒有任何加密,直接以明文顯示,所以直接拿來用了;再根據是否跳轉頁面來判斷是否登入成功。除錯檢視結果,登入成功了。