
Web Crawlers (reposted notes)

Pretty interesting notes~

Keep learning~

1. Building a Sina news crawler (course: http://study.163.com/course/courseMain.htm?courseId=1003285002)

In Chrome, open the developer tools (Inspect) -> Network -> Doc, reload the page, pick the first request, then inspect the element to find the corresponding tags.

import requests
from bs4 import BeautifulSoup

newsurl='http://news.sina.com.cn/china/'
res=requests.get(newsurl)    # Chrome DevTools shows this is a GET request; a User-Agent can also be set here
res.encoding='UTF-8'         # prevent garbled characters
print res.text               # inspect the structure along the way
soup=BeautifulSoup(res.text,'html.parser')
for news in soup.select('.news-item'):    # prefix ids with "#" and classes with "." when selecting
    if len(news.select('h2'))>0:
        h2=news.select('h2')[0].text
        time=news.select('.time')[0].text
        a=news.select('a')[0]['href']     # this grabs the link; use .text instead to grab the content
        print h2,a,time

----------------------------------------------------------------------------------------
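The comment in the snippet above mentions that a User-Agent can be set on the request; a minimal sketch of doing so with requests (the header value here is illustrative, not from the original notes):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}    # illustrative User-Agent string
res = requests.get('http://news.sina.com.cn/china/', headers=headers)
res.encoding = 'UTF-8'
print(res.status_code)                     # 200 means the request succeeded

----------------------------------------------------------------------------------------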

import requests
from datetime import datetime
from bs4 import BeautifulSoup

newsurl='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
res=requests.get(newsurl)
res.encoding='UTF-8'
soup=BeautifulSoup(res.text,'html.parser')
bodyTitle=soup.select('#artibodyTitle')[0].text
timesource=soup.select('.time-source')[0].contents[0].strip()
#dt=datetime.strptime(timesource,'%Y%m%d%H:%M')
source=soup.select('.time-source span a')[0].text
article=[]
for p in soup.select('#artibody p')[:-1]:
    article.append(p.text.strip())
''.join(article)
''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
#soup.select('.article-editor')[0].text.lstrip('責任編輯')
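The commented-out strptime line needs a format string that matches the page's time text; a sketch assuming the .time-source text looks like '2017年05月13日09:47' (the actual page format may differ), written for Python 3:

from datetime import datetime

timesource = '2017年05月13日09:47'                     # assumed sample of the .time-source text
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
print(dt.strftime('%Y-%m-%d %H:%M'))                  # 2017-05-13 09:47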

--------------------------------------------------------------------------------------------------------------------------
Find the comment API link under the JS requests in the Network panel.

import requests
import json

newsurl='http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-fyfecvz1234039&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
comments=requests.get(newsurl)
comments.encoding='UTF-8'
jd=json.loads(comments.text.strip('var data='))   # strip off the "var data=" wrapper, leaving plain JSON
jd['result']['count']['total']
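Note that str.strip treats its argument as a set of characters rather than a literal prefix; it happens to work on this payload, but a more explicit way to drop the "var data=" wrapper is sketched below (the payload string is illustrative):

import json

raw = 'var data={"result": {"count": {"total": 42}}}'   # illustrative payload, same shape as the API response
prefix = 'var data='
payload = raw[len(prefix):] if raw.startswith(prefix) else raw
jd = json.loads(payload)
print(jd['result']['count']['total'])                   # 42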

---------------------------------------------------------------------------------------------
newsurl='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
newid=newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')
print newid

import re

m=re.search('doc-i(.*).shtml',newsurl)
newsid=m.group(1)
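A hedged illustration (with a made-up news id) of why the regex is more robust than the rstrip/lstrip chain above: the strip family removes any character in the given set, so an id whose first letter happens to be in "doc-i" gets truncated.

import re

url='http://news.sina.com.cn/o/2017-05-13/doc-icampi123.shtml'   # hypothetical id "campi123"
print(url.split('/')[-1].rstrip('.shtml').lstrip('doc-i'))       # 'ampi123'  (leading 'c' lost)
print(re.search('doc-i(.+).shtml',url).group(1))                 # 'campi123' (correct)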

-----------------------------------------------------------------------------------------------------------
import re
import json
import requests

commentsurl='http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'

def getCommentCounts(newsurl):
    m=re.search('doc-i(.+).shtml',newsurl)
    newsid=m.group(1)
    comments=requests.get(commentsurl.format(newsid))   # plug newsid into the {} placeholder in commentsurl
    jd=json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']

news='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
getCommentCounts(news)

------------------------------------------------------------------------------
Final result

import requests
from bs4 import BeautifulSoup

def getNewsDetail(newsurl):
    result={}
    res=requests.get(newsurl)
    res.encoding='utf-8'
    soup=BeautifulSoup(res.text,'html.parser')
    result['title']=soup.select('#artibodyTitle')[0].text
    result['newssource']=soup.select('.time-source')[0].text
    result['comments']=getCommentCounts(newsurl)
    return result

getNewsDetail('http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml')
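The notes stop at a single article; a minimal sketch (the helper name getNewsLinks is mine, not from the notes) of reusing the list-page selectors from the first snippet to feed every article link into getNewsDetail:

import requests
from bs4 import BeautifulSoup

def getNewsLinks(listurl):
    # collect article links from the list page, reusing the '.news-item' selector from the first snippet
    res=requests.get(listurl)
    res.encoding='UTF-8'
    soup=BeautifulSoup(res.text,'html.parser')
    return [news.select('a')[0]['href'] for news in soup.select('.news-item') if len(news.select('h2'))>0]

#for url in getNewsLinks('http://news.sina.com.cn/china/'):
#    print(getNewsDetail(url))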

Fetching content

import urllib2

# direct request
response=urllib2.urlopen('http://www.baidu.com')
# get the status code; 200 means success
print response.getcode()
cont=response.read()

-------------------
import urllib2

url='http://www.baidu.com'
request=urllib2.Request(url)
request.add_data('a')                              # attach data, which turns the request into a POST
request.add_header('User-Agent','Mozilla/5.0')     # set the User-Agent header
response=urllib2.urlopen(request)
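These urllib2 snippets are Python 2 only; in Python 3 the same calls live in urllib.request (which the HTMLDownloader below already uses), and the data and headers are passed to the Request constructor. A minimal sketch of the equivalent:

from urllib import request

req=request.Request('http://www.baidu.com',
                    data=b'a',                              # data must be bytes; non-None data makes this a POST
                    headers={'User-Agent': 'Mozilla/5.0'})  # same illustrative User-Agent as above
response=request.urlopen(req)
print(response.getcode())
cont=response.read()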

-------------------------------------
import urllib2,cookielib

# create a cookie container
cj=cookielib.CookieJar()
# create an opener
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# install the opener into urllib2
urllib2.install_opener(opener)
response=urllib2.urlopen('http://www.baidu.com')

--------------------------------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re

html_doc ='''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''

# build a BeautifulSoup object from the HTML
soup=BeautifulSoup(html_doc,          # the document string
                   'html.parser',     # the parser
                   from_encoding='utf-8')

links=soup.find_all('a')
for link in links:
    print link.name,link['href'],link.get_text()

link_node=soup.find('a',href="http://example.com/lacie")
print link_node.name,link_node['href'],link_node.get_text()

link_node=soup.find('a',href=re.compile(r'ill'))
print link_node.name,link_node['href'],link_node.get_text()

p_node=soup.find('p',class_='title')  # class is a Python keyword, so class_ is used instead
print p_node.name,p_node.get_text()

----------------------------------------------------------------------------------------
Final version: Crawler_main.py

import URLManager, HTMLDownloader, HTMLParser, HTMLOutputer

class CrawlerMain(object):
    def __init__(self):
        self.urls = URLManager.UrlManager()                 # initialize the URL manager
        self.downloader = HTMLDownloader.HtmlDownloader()   # initialize the HTML downloader
        self.parser = HTMLParser.HtmlParser()               # initialize the HTML parser
        self.outputer = HTMLOutputer.HtmlOutputer()         # initialize the HTML outputer

    def crawl(self, root_url):
        count = 1                                           # crawl counter
        self.urls.add_new_url(root_url)                     # add the entry URL to the manager
        while self.urls.has_new_url():                      # keep crawling while the URL pool is not empty
            try:
                new_url = self.urls.get_new_url()           # take the next URL to download
                print('crawl %d: %s' % (count, new_url))    # show which page is being crawled and its URL
                html_cont = self.downloader.download(new_url)                   # download the page
                new_urls, new_data = self.parser.hparse(new_url, html_cont)     # get the new URL list and page data
                self.urls.add_new_urls(new_urls)            # add the new URLs to the manager
                self.outputer.collect_data(new_data)        # collect the data
                if count == 10:
                    break
                count = count + 1
            except:
                print('Crawl Failed')
        self.outputer.output_html()                         # write the collected data out as HTML

if __name__ == '__main__':
    root_url = "http://baike.baidu.com/item/Python"         # entry URL
    obj_crawler = CrawlerMain()                             # create the crawler instance
    obj_crawler.crawl(root_url)                             # start crawling

---

HTMLDownloader.py

from urllib import request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # open the page
        response = request.urlopen(url)
        if response.getcode() != 200:
            # return None if the request failed
            return None
        else:
            # return the page content on success
            return response.read().decode("utf-8")

----
URLManager.py

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            # only add the URL if it has not been added or visited before
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # add each URL in the list to the to-visit queue (routed through add_new_url so visited URLs are not re-added)
            self.add_new_url(url)

    def has_new_url(self):
        # whether the URL pool still has unvisited URLs
        return len(self.new_urls) != 0

    def get_new_url(self):
        # take one unvisited URL, mark it as visited, and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

------------
HTMLParser.py

from bs4 import BeautifulSoup
import re
from urllib import parse

class HtmlParser(object):
    # page_url is the page URL, html_cont is the fetched page content
    def hparse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        # parse the page content with BeautifulSoup
        soup = BeautifulSoup(html_cont, 'html.parser')
        # collect the URLs contained in the page
        new_urls = self._get_new_urls(page_url, soup)
        # extract the data we want from the page
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # fuzzy match with a regular expression
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # join into a full URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

------------
HTMLOutputer.py

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')

        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
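A small sketch (not part of the original notes) of exercising UrlManager on its own, assuming URLManager.py above is on the import path; it shows that duplicates are filtered and each URL is handed out exactly once:

import URLManager

um = URLManager.UrlManager()
um.add_new_url('http://baike.baidu.com/item/Python')
um.add_new_urls(['http://baike.baidu.com/item/Python',      # duplicate, ignored
                 'http://baike.baidu.com/item/Guido'])      # hypothetical second entry
while um.has_new_url():
    print(um.get_new_url())   # prints each of the two distinct URLs once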