一個完整的python大作業
阿新 • • 發佈:2017-11-02
off pytho tle code rate odin 制作 with wid
由於能選擇一個感興趣的網站進行數據分析,所以這次選擇爬取的網站是新華網,其網址為"http://www.xinhuanet.com/",然後對其進行數據分析並生成詞雲
運行整個程序相關的代碼包
import requests import re from bs4 import BeautifulSoup from datetime import datetime import pandas import sqlite3 import jieba from wordcloud import WordCloud import matplotlib.pyplot as plt
爬取網頁信息
url = "http://www.xinhuanet.com/" f=open("css.txt","w+") res0 = requests.get(url) res0.encoding="utf-8" soup = BeautifulSoup(res0.text,"html.parser") newsgroup=[] for news in soup.select("li"): if len(news.select("a"))>0: print(news.select("a")[0].text) title=news.select("a")[0].text f.write(title) f.close()
存入txt文件中,並進行字詞統計
f0 = open(‘css.txt‘,‘r‘) qz=[] qz=f0.read() f0.close() print(qz) words = list(jieba.cut(qz)) ul={‘:‘,‘的‘,‘"‘,‘、‘,‘”‘,‘“‘,‘。‘,‘!‘,‘:‘,‘?‘,‘ ‘,‘\u3000‘,‘,‘,‘\n‘} dic={} keys = set(words)-ul for i in keys: dic[i]=words.count(i) c = list(dic.items()) c.sort(key=lambda x:x[1],reverse=True) f1= open(‘diectory.txt‘,‘w‘) for i in range(10): print(c[i]) for words_count in range(c[i][1]): f1.write(c[i][0]+‘ ‘) f1.close()
存入數據庫
df = pandas.DataFrame(words) print(df.head()) with sqlite3.connect(‘newsdb3.sqlite‘) as db: df.to_sql(‘newsdb3‘,con = db)
制作詞雲
f3 = open(‘diectory.txt‘,‘r‘) cy_file = f3.read() f3.close() cy = WordCloud().generate(cy_file) plt.imshow(cy) plt.axis("off") plt.show()
最終成果
完整代碼
import requests import re from bs4 import BeautifulSoup from datetime import datetime import pandas import sqlite3 import jieba from wordcloud import WordCloud import matplotlib.pyplot as plt url = "http://www.xinhuanet.com/" f=open("css.txt","w+") res0 = requests.get(url) res0.encoding="utf-8" soup = BeautifulSoup(res0.text,"html.parser") newsgroup=[] for news in soup.select("li"): if len(news.select("a"))>0: print(news.select("a")[0].text) title=news.select("a")[0].text f.write(title) f.close() f0 = open(‘css.txt‘,‘r‘) qz=[] qz=f0.read() f0.close() print(qz) words = list(jieba.cut(qz)) ul={‘:‘,‘的‘,‘"‘,‘、‘,‘”‘,‘“‘,‘。‘,‘!‘,‘:‘,‘?‘,‘ ‘,‘\u3000‘,‘,‘,‘\n‘} dic={} keys = set(words)-ul for i in keys: dic[i]=words.count(i) c = list(dic.items()) c.sort(key=lambda x:x[1],reverse=True) f1 = open(‘diectory.txt‘,‘w‘) for i in range(10): print(c[i]) for words_count in range(c[i][1]): f1.write(c[i][0]+‘ ‘) f1.close() df = pandas.DataFrame(words) print(df.head()) with sqlite3.connect(‘newsdb3.sqlite‘) as db: df.to_sql(‘newsdb3‘,con = db) f3 = open(‘diectory.txt‘,‘r‘) cy_file = f3.read() f3.close() cy = WordCloud().generate(cy_file) plt.imshow(cy) plt.axis("off") plt.show()
一個完整的python大作業