
STITP Project Log, Period 3

During this period I mainly finished a crawler for the CNN News China section. It can scrape the title, date, URL, and body text of the China-related reports on that site.

import requests
from requests.exceptions import RequestException
import re
import time
from bs4 import BeautifulSoup
import lxml
import json


def get_one_page(url):
	try:
		headers = {
			'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
		}
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			return response.text
		print('url:' + url + '\nWarning : status_code is not 200!')
		return None
	except RequestException as e:
		print('url:' + url + '\nWarning : we got an exception!')
		print(str(e))
		return None


def write_to_file(content):
	# Append each article as one JSON object per line.
	with open('result.txt', 'a', encoding='utf-8') as f:
		f.write(json.dumps(content, ensure_ascii=False) + '\n')


def get_content(url, title):
	# Relative links from the section page need the site prefix added back.
	if str(url)[0] != 'h':
		url = 'http://edition.cnn.com' + url
	html = get_one_page(url)
	soup = BeautifulSoup(html, 'html.parser')
	# print(soup.prettify())
	date = soup.find(name='p', attrs={'class': 'update-time'}).get_text()
	containers = soup.find_all(attrs={'class': 'zn-body__paragraph speakable'})
	content = ""
	for container in containers:
		content = content + container.get_text()
	print(url + ' done!')
	return date, content


def parse_section_1(section_1):
	div = section_1.find(attrs={'class': 'zn__containers'})
	area_1 = div.find(attrs={'class': 'column zn__column--idx-0'})
	area_2 = div.find(attrs={'class': 'column zn__column--idx-1'})
	# The first column holds a single opinion piece.
	title = area_1.find(attrs={'class': 'cd__headline-text'}).string
	url = area_1.find(name='a').attrs['href']
	date, content = get_content(url, title)
	opinion = {'type': 'opinion', 'title': title, 'url': url, 'date': date, 'content': content}
	write_to_file(opinion)
	# The second column lists the top stories.
	for article in area_2.find_all(name='article'):
		url = article['data-vr-contentbox']
		title = article.find(attrs={'class': 'cd__headline-text'}).string
		date, content = get_content(url, title)
		top_stories = {'type': 'top stories', 'title': title, 'url': url, 'date': date, 'content': content}
		write_to_file(top_stories)
	# print('parse_section_1 done!')


def parse_section_2(section_2):
	div = section_2.find(attrs={'class': 'zn__containers'})
	area_1 = div.find(attrs={'class': 'column zn__column--idx-0'})
	area_2 = div.find(attrs={'class': 'column zn__column--idx-1'})
	# The first column lists the "news and buzz" articles.
	for article in area_1.find_all(name='article'):
		url = article['data-vr-contentbox']
		title = article.find(attrs={'class': 'cd__headline-text'}).string
		date, content = get_content(url, title)
		news = {'type': 'news and buzz', 'title': title, 'url': url, 'date': date, 'content': content}
		write_to_file(news)
	# The second column holds a single "in depth" piece.
	title = area_2.find(name='span', attrs={'class': 'cd__headline-text'}).string
	url = area_2.find(name='a').attrs['href']
	date, content = get_content(url, title)
	in_depth = {'type': 'in depth', 'title': title, 'url': url, 'date': date, 'content': content}
	write_to_file(in_depth)
	# print('parse_section_2 done!')


def main():
	url = 'https://edition.cnn.com/china'
	html = get_one_page(url)
	soup = BeautifulSoup(html, 'html.parser')
	# print(soup.prettify())
	sections = soup.find_all(name='section')
	parse_section_1(sections[0])
	parse_section_2(sections[1])


if __name__ == '__main__':
	main()
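
Since write_to_file appends every scraped article to result.txt as one JSON object per line, the results can be loaded back later without re-crawling. A minimal sketch of reading that file (this reader is not part of the crawler itself):

import json

# Load the crawler's output: one JSON object per line of result.txt.
with open('result.txt', 'r', encoding='utf-8') as f:
	articles = [json.loads(line) for line in f if line.strip()]

for article in articles:
	# Each record carries the type, title, url, date, and content fields.
	print(article['type'], '|', article['date'], '|', article['title'])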

Unfortunately, this code still has one problem: it cannot scrape the complete body text of an individual report.
The article pages' HTML is just too convoluted...
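
One idea for the next revision: the paragraphs being missed may simply lack the speakable modifier and carry only the bare zn-body__paragraph class. That is an assumption about CNN's markup I still need to verify, and the helper name get_full_body below is just for illustration, but if it holds, matching on the class list instead of the exact class string should pick those paragraphs up:

from bs4 import BeautifulSoup

def get_full_body(html):
	# Assumption: match any element whose class list contains
	# 'zn-body__paragraph', with or without extra modifiers like 'speakable'.
	soup = BeautifulSoup(html, 'html.parser')
	paragraphs = soup.find_all(class_='zn-body__paragraph')
	return '\n'.join(p.get_text() for p in paragraphs)
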
That's enough for now; back to improving it.