不和諧網站圖片抓取

阿新 • • 發佈：2019-02-01

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.Xml;
using mshtml;
using System.Threading;
// 2010-05-11 15:14
// The usings below were added so the wininet module can be called to change the IE proxy options.
using System.Runtime.InteropServices;
using System.Diagnostics;
using Microsoft.Win32;

namespace ehentai
{
    /// <summary>
    /// Main form of the image grabber. Two WebBrowser controls are used:
    /// <c>wbMain</c> loads list pages and <c>wb</c> loads the individual image pages.
    /// Images are copied out of the rendered page through the clipboard and saved
    /// into a folder named after the list page's title.
    /// </summary>
    public partial class frmMain : Form
    {
        // WinINet option codes passed to InternetSetOption so running IE instances
        // pick up changed proxy settings.
        private const int InternetOptionRefresh = 37;
        private const int InternetOptionSettingsChanged = 39;

        // Registry sub-key (under HKCU) holding the IE proxy configuration.
        // Registry paths must be separated by backslashes; forward slashes do not resolve.
        private const string InternetSettingsKeyPath = @"Software\Microsoft\Windows\CurrentVersion\Internet Settings";

        // Links with an href longer than this are queued as image pages, shorter ones as list pages.
        private const int ImageLinkMinLength = 50;

        // Position of the file name inside an image URL split on '/'.
        // NOTE(review): this is tied to the URL layout of the target site - confirm it is still valid.
        private const int FileNameSegmentIndex = 7;

        int documCounter = 0;           // pending navigations of wbMain (Navigated ++ / DocumentCompleted --)
        int dCounter = 0;               // pending navigations of wb (Navigated ++ / DocumentCompleted --)
        bool lockPage = true;           // while true, prosseList does not queue further list pages
        string htmlCode = "";           // never assigned in the visible code; only compared in prosseIndex
        ArrayList pages = new ArrayList();  // queued list-page URLs
        ArrayList imgs = new ArrayList();   // queued image-page URLs
        WebBrowser wb = new WebBrowser();   // browser used for image pages

        // 2010-05-11 15:15
        // Imported from wininet in order to change the IE proxy options.
        [DllImport(@"wininet", SetLastError = true, CharSet = CharSet.Auto, EntryPoint = "InternetSetOption", CallingConvention = CallingConvention.StdCall)]
        public static extern bool InternetSetOption(int hInternet, int dmOption, IntPtr lpBuffer, int dwBufferLength);

        /// <summary>
        /// Enables a SOCKS proxy for the current user's Internet Settings and notifies WinINet.
        /// </summary>
        /// <param name="proxy">Proxy endpoint, e.g. <c>127.0.0.1:8080</c>.</param>
        /// <exception cref="ArgumentException"><paramref name="proxy"/> is null or empty.</exception>
        /// <exception cref="InvalidOperationException">The Internet Settings registry key could not be opened.</exception>
        public static void SetProxy(string proxy)
        {
            if (string.IsNullOrEmpty(proxy))
            {
                throw new ArgumentException("A proxy address is required.", "proxy");
            }

            // Open the registry key writable and make sure the handle is released again.
            using (RegistryKey optionKey = Registry.CurrentUser.OpenSubKey(InternetSettingsKeyPath, true))
            {
                if (optionKey == null)
                {
                    throw new InvalidOperationException("Registry key not found: " + InternetSettingsKeyPath);
                }

                // Change the values that configure the proxy.
                optionKey.SetValue("ProxyEnable", 1);
                optionKey.SetValue("ProxyServer", "socks=" + proxy);
            }

            // Activate the new proxy settings.
            InternetSetOption(0, InternetOptionSettingsChanged, IntPtr.Zero, 0);
            InternetSetOption(0, InternetOptionRefresh, IntPtr.Zero, 0);
        }

        /// <summary>Creates the form and places the image browser on it.</summary>
        public frmMain()
        {
            InitializeComponent();

            // NOTE(review): the subscriptions below are disabled in the original source. Unless
            // they are attached elsewhere in this partial class, wb_Navigated and
            // wb_DocumentCompleted never run and the image queue is never processed - confirm.
            //wb.Navigated += new WebBrowserNavigatedEventHandler(wb_Navigated);
            //wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
            //wbMain.ScriptErrorsSuppressed = true;
            //wb.ScriptErrorsSuppressed = true;

            showWb();

            //SetProxy("127.0.0.1:8080");
        }

        // 2010-06-05: shows the wb control itself inside the second child control of splitContainer2.
        private void showWb()
        {
            wb.Show();
            wb.Visible = true;
            wb.BringToFront();
            wb.Parent = splitContainer2.Controls[1];
            wb.Dock = DockStyle.Fill;
        }

        /// <summary>Counts a started navigation of the image browser.</summary>
        void wb_Navigated(object sender, WebBrowserNavigatedEventArgs e)
        {
            dCounter += 1;
        }

        /// <summary>
        /// Once every pending navigation of the image browser has completed, saves the image
        /// of the loaded page and moves on to the next queued image, list page, or finishes.
        /// </summary>
        void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            dCounter--;
            if (dCounter != 0)
            {
                return;
            }

            timer1.Enabled = false;
            prosseImg("");

            if (imgs.Count > 0)
            {
                //Thread.Sleep(10000);
                wb.Navigate(imgs[0].ToString());
                timer1.Enabled = true;
                sendMsg("Navigating to image link:" + imgs[0].ToString());
                imgs.RemoveAt(0);
            }
            else
            {
                NavigateToNextPageOrFinish();
            }
        }

        /// <summary>Counts a started navigation of the list browser.</summary>
        private void wbMain_Navigated(object sender, WebBrowserNavigatedEventArgs e)
        {
            documCounter += 1;
        }

        /// <summary>Processes the list page once every pending navigation of wbMain has completed.</summary>
        private void wbMain_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            documCounter--;
            if (documCounter == 0)
            {
                prosseList(wbMain.DocumentText);
            }
        }

        /// <summary>
        /// Returns every piece of text in <paramref name="code"/> that sits between
        /// <paramref name="wordsBegin"/> and <paramref name="wordsEnd"/> (non-greedy, case-insensitive).
        /// </summary>
        /// <param name="code">Text to search.</param>
        /// <param name="wordsBegin">Start delimiter; inserted into the pattern as-is, so regex metacharacters are interpreted.</param>
        /// <param name="wordsEnd">End delimiter; inserted into the pattern as-is.</param>
        /// <returns>The captured strings in document order; empty when nothing matches.</returns>
        private ArrayList SniffCode(string code, string wordsBegin, string wordsEnd)
        {
            ArrayList urlList = new ArrayList();

            // [\s\S] matches any character including line breaks. RegexOptions.Compiled is not
            // used because the expression is rebuilt on every call and compiling would only add cost.
            Regex regex1 = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

            for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
            {
                urlList.Add(match1.Groups["title"].ToString());
            }

            return urlList;
        }

        /// <summary>
        /// Experimental index-page parser: shows the thumbnail source of every "it2" div and
        /// lists the cell contents of the result rows in the message list.
        /// </summary>
        /// <param name="htmlCode1">Raw HTML of the index page.</param>
        private void prosseIndex(string htmlCode1)
        {
            if (wb.Document == null)
            {
                sendMsg("No document loaded in the image browser.");
                return;
            }

            // Strip line breaks, tabs and double quotes so the delimiter search below is simpler.
            htmlCode1 = htmlCode1.Replace("\n", "");
            htmlCode1 = htmlCode1.Replace("\t", "");
            htmlCode1 = htmlCode1.Replace("\"", "");

            ArrayList it2s = SniffCode(htmlCode1, @"<div class=it2 id=", @" style=");
            for (int y = 0; y < it2s.Count; y++)
            {
                string markup = pic_it2(it2s[y].ToString());
                if (string.IsNullOrEmpty(markup))
                {
                    continue;
                }

                // The markup returned by pic_it2 is handed to CreateElement unchanged, as in the original.
                HtmlElement el1 = wb.Document.CreateElement(markup);
                MessageBox.Show(el1.GetAttribute("src"));
            }

            // Compares the htmlCode field (not the parameter); the field is never assigned in the visible code.
            if (htmlCode == string.Empty)
            {
                // The original also carried disabled XmlTextWriter code here that dumped
                // img/a/div attributes into a timestamped XML file; it has been left out.
                htmlCode1 = htmlCode1.Replace("gtr1", "gtr0");
                ArrayList gtr = SniffCode(htmlCode1, @"<tr class=gtr0>", @"</tr>");
                //gtr.AddRange(SniffCode(htmlCode, @"<tr class=gtr1>", @"</tr>"));

                if (gtr.Count > 0)
                {
                    for (int i = 0; i < gtr.Count; i++)
                    {
                        wb.Document.Write(gtr[i].ToString());

                        foreach (HtmlElement el in wb.Document.All)
                        {
                            // Original note on the disabled img branch: oddly, the thumbnail links
                            // are sometimes enumerated and sometimes not - maybe the page's
                            // JavaScript has not finished running?
                            if (string.Equals(el.TagName, "td", StringComparison.OrdinalIgnoreCase)
                                && el.InnerHtml != null)
                            {
                                lstMessage.Items.Add(el.InnerHtml);
                            }
                        }

                        lstMessage.Items.Add("==========================================");
                    }
                }
                else
                {
                    MessageBox.Show("沒有相關內容！請重新設定搜尋條件！");
                }
            }
        }

        /// <summary>
        /// Handles a loaded list page: creates the output folder, collects the links with
        /// class "noul" into the image and page queues, and starts loading the first image page.
        /// </summary>
        /// <param name="htmlCode">HTML of the list page (unused; the DOM of wbMain is read instead).</param>
        private void prosseList(string htmlCode)
        {
            sendMsg("Prossing list page.");

            string saveDirectory = GetSaveDirectory();
            if (!Directory.Exists(saveDirectory))
            {
                Directory.CreateDirectory(saveDirectory);
                sendMsg("Dir " + wbMain.Document.Title + "created.");
            }

            HtmlElementCollection gec = wbMain.Document.GetElementsByTagName("a");
            imgs.Clear();
            sendMsg("Looking for links with class='noul'.");

            foreach (HtmlElement he in gec)
            {
                if (he.GetAttribute("className") != "noul")
                {
                    continue;
                }

                string url = he.GetAttribute("href");
                if (url.Length > ImageLinkMinLength)
                {
                    imgs.Add(url);
                }
                else if (!pages.Contains(url) && !lockPage)
                {
                    pages.Add(url);
                }
            }

            sendMsg("Got " + pages.Count.ToString() + "page links and" + imgs.Count.ToString() + "image links.");
            lockPage = true;

            // A page without image links previously failed on imgs[0]; continue with the
            // next list page (or finish) instead.
            if (imgs.Count == 0)
            {
                NavigateToNextPageOrFinish();
                return;
            }

            wb.Navigate(imgs[0].ToString());
            sendMsg("Navigating to image link:" + imgs[0].ToString());
            imgs.RemoveAt(0);
        }

        /// <summary>
        /// Saves the picture shown on the image page currently loaded in <c>wb</c>. Every link
        /// that points to an image file is looked up among the page's img tags, copied to the
        /// clipboard and written to the output folder unless the file already exists.
        /// Failures are reported to the message list and never thrown.
        /// </summary>
        /// <param name="url">Unused; kept for the existing call sites.</param>
        private void prosseImg(string url)
        {
            try
            {
                sendMsg("Prossing image page.");
                HtmlElementCollection hec = wb.Document.Links;
                string saveDirectory = GetSaveDirectory();
                Directory.CreateDirectory(saveDirectory); // no-op when the folder already exists

                sendMsg("Looping.");
                foreach (HtmlElement he in hec)
                {
                    string imgUrl = he.GetAttribute("href");
                    if (!IsImageUrl(imgUrl))
                    {
                        continue;
                    }

                    string[] segments = imgUrl.Split(new char[] { '/' });
                    if (segments.Length <= FileNameSegmentIndex)
                    {
                        // Skip links that do not follow the expected layout instead of aborting the loop.
                        sendMsg("Unexpected image url:" + imgUrl);
                        continue;
                    }

                    string fileName = segments[FileNameSegmentIndex];
                    sendMsg("Got file name:" + fileName);

                    string targetPath = Path.Combine(saveDirectory, fileName);
                    if (File.Exists(targetPath))
                    {
                        continue;
                    }

                    IHTMLControlElement img = GetImgTag(imgUrl, fileName);
                    sendMsg("Saving file.");
                    if (img == null)
                    {
                        continue;
                    }

                    // Copy the rendered image through the clipboard instead of downloading it again.
                    IHTMLControlRange rang = (IHTMLControlRange)((HTMLBody)((HTMLDocument)wb.Document.DomDocument).body).createControlRange();
                    rang.add(img);
                    rang.execCommand("Copy", false, null);

                    Image numImage = Clipboard.GetImage();
                    Clipboard.Clear();
                    if (numImage == null)
                    {
                        // GetImage returns null when the clipboard holds no bitmap data.
                        sendMsg("Clipboard has no image for:" + fileName);
                        continue;
                    }

                    using (numImage)
                    {
                        numImage.Save(targetPath);
                    }

                    sendMsg("Got one!");
                }
            }
            catch (Exception eee)
            {
                // Deliberate best effort: report the problem and let the crawl continue.
                sendMsg(eee.Message);
            }
        }

        /// <summary>
        /// Finds the img element of the page loaded in <c>wb</c> whose source is an image file
        /// and whose alt text equals <paramref name="fileName"/>.
        /// </summary>
        /// <param name="src">Unused; kept for the existing call sites.</param>
        /// <param name="fileName">Expected alt text of the image.</param>
        /// <returns>The last matching element, or null when there is none.</returns>
        private IHTMLControlElement GetImgTag(string src, string fileName)
        {
            sendMsg("Looking for img tag.");
            IHTMLControlElement img = null;
            HtmlElementCollection hec = wb.Document.Images;

            sendMsg("Looping.");
            foreach (HtmlElement he in hec)
            {
                string imgSrc = he.GetAttribute("src");
                string imgAlt = he.GetAttribute("alt");
                if (IsImageUrl(imgSrc) && imgAlt == fileName)
                {
                    img = (IHTMLControlElement)he.DomElement;
                }
            }

            return img;
        }

        /// <summary>
        /// Runs the page's own <c>load_pane_image</c> script for the element with the given id
        /// in <c>wbMain</c> and returns that element's inner HTML afterwards.
        /// </summary>
        /// <param name="id">Element id taken from the page source.</param>
        /// <returns>The inner HTML, or an empty string when no element has that id.</returns>
        private string pic_it2(string id)
        {
            mshtml.IHTMLDocument2 currentDoc = (mshtml.IHTMLDocument2)wbMain.Document.DomDocument;
            mshtml.IHTMLWindow2 win = (mshtml.IHTMLWindow2)currentDoc.parentWindow;

            // NOTE(review): id comes from scraped HTML and is concatenated into script text unescaped.
            win.execScript("load_pane_image(document.getElementById('" + id + "'));", "javascript");

            HtmlElement target = wbMain.Document.All[id];
            return target == null ? string.Empty : target.InnerHtml;
        }

        /// <summary>Appends a line to the message list and selects it.</summary>
        private void sendMsg(string msg)
        {
            lstMessage.Items.Add(msg);
            lstMessage.SetSelected(lstMessage.Items.Count - 1, true);
        }

        /// <summary>Time-out for an image page: stops the load and disables the timer.</summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            sendMsg("Time out.");
            // Presumably set to 1 so the completion event following Stop() brings the
            // counter to zero and the queue advances - TODO confirm.
            dCounter = 1;
            wb.Stop();
            timer1.Enabled = false;
        }

        /// <summary>Starts a crawl at the URL entered in the search box.</summary>
        private void btnSearch_Click_1(object sender, EventArgs e)
        {
            lockPage = false;
            sendMsg("Start.");
            sendMsg("Navigating to list link:" + txtSearch.Text);
            wbMain.Navigate(txtSearch.Text);
        }

        /// <summary>Form closing handler; resetting the proxy is currently disabled.</summary>
        private void frmMain_FormClosing(object sender, FormClosingEventArgs e)
        {
            //RegistryKey regKey = Registry.CurrentUser;
            //string SubKeyPath = @"Software\Microsoft\Windows\CurrentVersion\Internet Settings";
            //RegistryKey optionKey = regKey.OpenSubKey(SubKeyPath, true);
            // Change the value to switch the proxy off again.
            //optionKey.SetValue("ProxyEnable", 0);
        }

        // Loads the next queued list page in wbMain, or reports completion when the queue is empty.
        private void NavigateToNextPageOrFinish()
        {
            if (pages.Count > 0)
            {
                wbMain.Navigate(pages[0].ToString());
                sendMsg("Navigating to list link:" + pages[0].ToString());
                pages.RemoveAt(0);
            }
            else
            {
                lockPage = true;
                timer1.Enabled = false;
                sendMsg("Done.");
                MessageBox.Show("Done.");
            }
        }

        // Builds the output folder path from the title of the page in wbMain, replacing
        // characters that are not allowed in file names so folder creation cannot fail on them.
        private string GetSaveDirectory()
        {
            string title = wbMain.Document.Title ?? string.Empty;
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                title = title.Replace(invalid, '_');
            }

            return Path.Combine(".", title);
        }

        // True when the address ends with one of the image extensions handled by the grabber (any casing).
        private static bool IsImageUrl(string address)
        {
            if (string.IsNullOrEmpty(address))
            {
                return false;
            }

            return address.EndsWith("jpg", StringComparison.OrdinalIgnoreCase)
                || address.EndsWith("png", StringComparison.OrdinalIgnoreCase)
                || address.EndsWith("bmp", StringComparison.OrdinalIgnoreCase)
                || address.EndsWith("jpeg", StringComparison.OrdinalIgnoreCase);
        }
    }
}

不和諧網站的圖片抓取程式，完全是堆程式碼的，而且因為沒有使用WebRequest一類的東西，效率比較低，還經常容易抓不到，算是練手。

不和諧網站圖片抓取

不和諧網站圖片抓取

爬蟲：實現網站的全部圖片抓取

python 圖片抓取

python網絡數據抓取二（bing圖片抓取）

arpspoof+driftnet+ ARP欺騙簡單圖片抓取

基於類的Python多求職網站資訊抓取！

Python爬蟲實戰專案2 | 動態網站的抓取（爬取電影網站的資訊）

用Python BeautifulSoup寫的一份多執行緒圖片抓取的指令碼

Python爬蟲入門教程 18-100 煎蛋網XXOO圖片抓取

將遠端圖片抓取到本地

讓你營養跟不上的圖片爬取

百度圖片咋這麼多不和諧的圖片？？

python爬蟲實戰---今日頭條的圖片抓取

被懲罰的網站抓取不會減少BGP

ajax抓取網站接口圖片瀑布流筆記

為何大量網站不能抓取?爬蟲突破封禁的6種常見方法 - 轉載

picturebox加載圖片的三種方法與網站驗證碼的抓取

使用python抓取網站圖片，下載到本地

綜合使用python爬蟲技術，selenium模組動態抓取“視覺中國”網站上的圖片的url

為何大量網站不能抓取?爬蟲突破封禁的6種常見方法

不和諧網站圖片抓取

相關推薦