Python今日頭條網爬蟲
阿新 • • 發佈:2019-02-02
# -*- coding: utf-8 -*-
"""Crawl Toutiao search results for a keyword and extract gallery images.

The script queries the Toutiao ``search_content`` JSON endpoint, follows each
``article_url`` it returns, and pulls the title and image URLs out of the
``var gallery = ...;`` JavaScript assignment embedded in the article page.

NOTE(review): the encoding declaration above only takes effect on Python 2
when it sits on the first or second line of the file.
"""
import json
import re
import urllib

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Kept from the original script; nothing visible here uses a name from it.
from config import *  # noqa: F401,F403

try:  # Python 3
    from urllib.parse import urlencode
except ImportError:  # Python 2
    from urllib import urlencode

# Seconds to wait for a server before giving up; without a timeout
# ``requests`` can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10

# Captures the JSON object assigned to ``gallery`` in the page's inline script.
_GALLERY_PATTERN = re.compile(r'var gallery = (.*?);', re.S)


def _fetch_text(url, error_message):
    """Return the body of ``url`` as text, or ``None`` on any failure.

    ``error_message`` is printed when ``requests`` raises. A non-200 status
    returns ``None`` silently, as in the original script.
    """
    try:
        # The request itself must be inside the ``try``: this is the call
        # that raises ``RequestException`` (connection error, timeout, bad URL).
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print(error_message)
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_index(offset, keyword):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: value sent as the ``offset`` query parameter.
        keyword: search keyword.

    Returns:
        The response text on HTTP 200, otherwise ``None``.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    return _fetch_text(url, u'請求索引頁出錯')


def parse_page_index(html):
    """Yield every article URL found in a search-result JSON document.

    Args:
        html: JSON text as returned by ``get_index``; may be ``None`` or
            empty, in which case nothing is yielded.

    Yields:
        Each truthy ``article_url`` value from the ``data`` list. Entries
        without one are skipped, since there is nothing to fetch for them.
    """
    if not html:
        return
    # json.loads turns the JSON string into Python objects
    # (json.dumps is the inverse).
    data = json.loads(html)
    if not data or 'data' not in data:
        return
    # ``data`` may be present but null; treat that as an empty result list.
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch an article page.

    Returns:
        The response text on HTTP 200, otherwise ``None``.
    """
    return _fetch_text(url, '請求詳情頁出錯')


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Args:
        html: the article page source.
        url: the article URL, echoed back in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images``, or ``None`` when
        the page has no ``var gallery = ...;`` assignment.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> gets an empty title instead of an IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None

    gallery = json.loads(match.group(1))
    # ``sub_images`` may be absent or null; treat both as "no images".
    sub_images = gallery.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def main():
    """Search for '街拍' and print the title of every gallery article found."""
    index_html = get_index(0, '街拍')
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        # Non-gallery articles return None; skip them.
        if result:
            print(result['title'])


if __name__ == '__main__':
    main()