Web Crawler Notes (reposted)
Quite interesting notes~
Keep learning~
1. Building a Sina News web crawler
http://study.163.com/course/courseMain.htm?courseId=1003285002
Use the Chrome browser: Inspect -> Network -> Doc, reload the page and take the first request; then inspect the element to find the corresponding tag.

import requests
from bs4 import BeautifulSoup

newsurl = 'http://news.sina.com.cn/china/'
res = requests.get(newsurl)       # Chrome shows this is a GET request; an agent (request header) can also be set here
res.encoding = 'UTF-8'            # prevent garbled characters
print res.text                    # inspect the structure along the way
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):   # use "#" to select by id, "." to select by class
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        a = news.select('a')[0]['href']  # note: this takes the link; use .text to take the content instead
        print h2, a, time
----------------------------------------------------------------------------------------
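A side sketch on the "agent" remark above: passing a User-Agent header to requests.get. The header value here is just an illustrative browser-like string, not from the course.

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}   # illustrative value; any browser-like UA string works
res = requests.get('http://news.sina.com.cn/china/', headers=headers)
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
print(len(soup.select('.news-item')))     # rough check that the list items were found
----------------------------------------------------------------------------------------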
import requests
from datetime import datetime
from bs4 import BeautifulSoup

newsurl = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
res = requests.get(newsurl)
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
bodyTitle = soup.select('#artibodyTitle')[0].text
timesource = soup.select('.time-source')[0].contents[0].strip()
# dt = datetime.strptime(timesource, '%Y%m%d%H:%M')   # the format string must match the page text; see the sketch below
source = soup.select('.time-source span a')[0].text
article = []
for p in soup.select('#artibody p')[:-1]:
    article.append(p.text.strip())
''.join(article)
''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])   # one-line equivalent of the loop above
# soup.select('.article-editor')[0].text.lstrip('責任編輯')           # strip the "責任編輯" (responsible editor) prefix
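The commented-out strptime call needs a format string that matches the page's timestamp text. Assuming the time-source text looks like '2017年05月13日11:30' (an assumption about this page's markup), a minimal sketch of the parse:

from datetime import datetime

timesource = '2017年05月13日11:30'                        # assumed sample value from the page
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')   # adjust the format to the actual text
print(dt.strftime('%Y-%m-%d %H:%M'))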
--------------------------------------------------------------------------------------------------------------------------
Look under the JS requests (Network -> JS) to find the comment-data link.

import requests
import json

newsurl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-fyfecvz1234039&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
comments = requests.get(newsurl)
comments.encoding = 'UTF-8'
jd = json.loads(comments.text.strip('var data='))
jd['result']['count']['total']
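Note that str.strip('var data=') removes a set of characters from both ends rather than the literal prefix; it happens to work on this response, but an explicit slice is safer. A sketch, using an assumed sample payload:

import json

text = 'var data={"result": {"count": {"total": 42}}}'   # assumed sample of the comment response body
prefix = 'var data='
payload = text[len(prefix):] if text.startswith(prefix) else text
jd = json.loads(payload)
print(jd['result']['count']['total'])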
---------------------------------------------------------------------------------------------
newsurl = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
newid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')   # rstrip/lstrip strip character sets, not exact substrings; the regex below is more reliable
print newid
import re
m = re.search('doc-i(.*).shtml', newsurl)
newsid = m.group(1)
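re.search returns None when the URL does not contain the doc-i pattern; a small sketch that guards against that (the helper name extract_newsid is mine, not from the notes):

import re

def extract_newsid(newsurl):
    # return the id between 'doc-i' and '.shtml', or None if the pattern is absent
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    return m.group(1) if m else None

print(extract_newsid('http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'))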
-----------------------------------------------------------------------------------------------------------
import re
import json
import requests

commentsurl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'

def getCommentCounts(newsurl):
    m = re.search('doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    comments = requests.get(commentsurl.format(newsid))   # substitute newsid into the {} placeholder of commentsurl
    jd = json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']

news = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
getCommentCounts(news)
------------------------------------------------------------------------------
Final result
import requests
from bs4 import BeautifulSoup

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source')[0].text
    result['comments'] = getCommentCounts(newsurl)
    return result

getNewsDetail('http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml')
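Building on the final result above, a sketch (not from the original notes; the function name getNewsList and the traversal are assumptions) that combines the channel-page selector from the first snippet with getNewsDetail to collect details for every listed article:

import requests
from bs4 import BeautifulSoup

def getNewsList(channelurl):
    # assumes getNewsDetail (and getCommentCounts) from the snippets above are already defined
    res = requests.get(channelurl)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    details = []
    for news in soup.select('.news-item'):
        if len(news.select('h2')) > 0:
            link = news.select('a')[0]['href']
            details.append(getNewsDetail(link))   # reuse the detail scraper defined above
    return details

# getNewsList('http://news.sina.com.cn/china/')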
Fetching content with urllib2 (Python 2):

import urllib2

# make the request directly
response = urllib2.urlopen('http://www.baidu.com')
# get the status code; 200 means success
print response.getcode()
cont = response.read()
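urllib2 exists only in Python 2; in Python 3 the same request goes through urllib.request, which is what HTMLDownloader.py below uses. A minimal sketch:

from urllib import request

response = request.urlopen('http://www.baidu.com')
print(response.getcode())                 # 200 means success
cont = response.read().decode('utf-8')    # decode assuming the page is UTF-8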
-------------------
import urllib2

url = 'http://www.baidu.com'
request = urllib2.Request(url)
request.add_data('a')                            # attach request data (turns the request into a POST)
request.add_header('User-Agent', 'Mozilla/5.0')  # pretend to be a browser
response = urllib2.urlopen(request)
-------------------------------------
import urllib2, cookielib

# create a cookie container
cj = cookielib.CookieJar()
# create an opener
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# install the opener into urllib2
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
--------------------------------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re

html_doc = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''

# build a BeautifulSoup object from the HTML
soup = BeautifulSoup(html_doc,        # document string
                     'html.parser',   # parser
                     from_encoding='utf-8')

links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

link_node = soup.find('a', href="http://example.com/lacie")
print link_node.name, link_node['href'], link_node.get_text()

link_node = soup.find('a', href=re.compile(r'ill'))
print link_node.name, link_node['href'], link_node.get_text()

p_node = soup.find('p', class_='title')   # class is a Python keyword, so the trailing _ disambiguates
print p_node.name, p_node.get_text()
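The same lookups can also be written with the CSS-selector style (soup.select) used in the Sina snippets earlier. A minimal sketch, Python 3 print, with a trimmed version of the html_doc above:

from bs4 import BeautifulSoup

html_doc = '''
<p class="title"><b>The Dormouse's story</b></p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
'''
soup = BeautifulSoup(html_doc, 'html.parser')

for link in soup.select('a.sister'):            # all <a> tags with class "sister"
    print(link['href'], link.get_text())
print(soup.select('#link2')[0].get_text())      # select by id
print(soup.select('p.title b')[0].get_text())   # nested selector: <b> inside <p class="title">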
----------------------------------------------------------------------------------------
Final project: Crawler_main.py
import URLManager, HTMLDownloader, HTMLParser, HTMLOutputer

class CrawlerMain(object):
    def __init__(self):
        self.urls = URLManager.UrlManager()                # initialize the URL manager
        self.downloader = HTMLDownloader.HtmlDownloader()  # initialize the HTML downloader
        self.parser = HTMLParser.HtmlParser()              # initialize the HTML parser
        self.outputer = HTMLOutputer.HtmlOutputer()        # initialize the HTML outputer

    def crawl(self, root_url):
        count = 1                                          # crawl counter
        self.urls.add_new_url(root_url)                    # seed the manager with the entry URL
        while self.urls.has_new_url():                     # keep crawling while the URL pool is not empty
            try:
                new_url = self.urls.get_new_url()          # take the next URL to download
                print('crawl %d: %s' % (count, new_url))   # report which page is being crawled and its URL
                html_cont = self.downloader.download(new_url)                # download the page
                new_urls, new_data = self.parser.hparse(new_url, html_cont)  # get the new URL list and the page data
                self.urls.add_new_urls(new_urls)           # add the new URLs to the manager
                self.outputer.collect_data(new_data)       # collect the data
                if count == 10:
                    break
                count = count + 1
            except:
                print('Crawl Failed')
        self.outputer.output_html()                        # write the collected data out as HTML

if __name__ == '__main__':
    root_url = "http://baike.baidu.com/item/Python"        # entry URL
    obj_crawler = CrawlerMain()                            # create the crawler instance
    obj_crawler.crawl(root_url)                            # start the crawl
---
HTMLDownloader.py

from urllib import request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # open the page
        response = request.urlopen(url)
        if response.getcode() != 200:
            # return None on failure
            return None
        else:
            # return the page content on success
            return response.read().decode("utf-8")
----
URLManager.py

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            # only add URLs that have not been added or visited before
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # add each URL in the list to the to-visit queue (via add_new_url, so visited URLs are not re-queued)
            self.add_new_url(url)

    def has_new_url(self):
        # report whether there are still URLs waiting to be crawled
        return len(self.new_urls) != 0

    def get_new_url(self):
        # take one unvisited URL, mark it as visited, and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
------------
HTMLParser.py

from bs4 import BeautifulSoup
import re
from urllib import parse

class HtmlParser(object):
    # page_url is the page URL, html_cont is the downloaded page content
    def hparse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        # parse the page content with BeautifulSoup
        soup = BeautifulSoup(html_cont, 'html.parser')
        # extract the URLs contained in the page
        new_urls = self._get_new_urls(page_url, soup)
        # extract the data we want from the page
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # fuzzy match with a regular expression
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # join into an absolute URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
------------
HTMLOutputer.py

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
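As a side sketch (not part of the original project): the same table output written as a standalone function with a with block, so output.html is closed even if a write fails. The function name and parameters here are illustrative.

def output_html(datas, path='output.html'):
    # standalone variant of HtmlOutputer.output_html; the with block closes the file automatically
    with open(path, 'w', encoding='utf-8') as fout:
        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write("<html><body><table>")
        for data in datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table></body></html>")

output_html([{'url': 'http://baike.baidu.com/item/Python', 'title': 'Python', 'summary': '...'}])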