1. 程式人生 > >一個完整的python大作業

一個完整的python大作業

off pytho tle code rate odin 制作 with wid

由於能選擇一個感興趣的網站進行數據分析,所以這次選擇爬取的網站是新華網,其網址為"http://www.xinhuanet.com/",然後對其進行數據分析並生成詞雲

技術分享

運行整個程序相關的代碼包

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime
import pandas
import sqlite3
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

爬取網頁信息

url = "
http://www.xinhuanet.com/" f=open("css.txt","w+") res0 = requests.get(url) res0.encoding="utf-8" soup = BeautifulSoup(res0.text,"html.parser") newsgroup=[] for news in soup.select("li"): if len(news.select("a"))>0: print(news.select("a")[0].text) title=news.select("a")[0].text f.write(title) f.close()

存入txt文件中,並進行字詞統計

f0 = open(css.txt,r)
qz=[]
qz=f0.read()
f0.close()
print(qz)

words = list(jieba.cut(qz))

ul={:,,",,,,,,,, ,\u3000,,\n}
dic={}

keys = set(words)-ul
for i in keys:
    dic[i]=words.count(i)

c = list(dic.items())
c.sort(key=lambda x:x[1],reverse=True)

f1 
= open(diectory.txt,w) for i in range(10): print(c[i]) for words_count in range(c[i][1]): f1.write(c[i][0]+ ) f1.close()

存入數據庫

df = pandas.DataFrame(words)

print(df.head())

with sqlite3.connect(newsdb3.sqlite) as db:

    df.to_sql(newsdb3,con = db)

制作詞雲

f3 = open(diectory.txt,r)
cy_file = f3.read()
f3.close()
cy = WordCloud().generate(cy_file)
plt.imshow(cy)
plt.axis("off")
plt.show()

最終成果

技術分享

技術分享

技術分享

技術分享

完整代碼

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime
import pandas
import sqlite3
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt


url = "http://www.xinhuanet.com/"

    

f=open("css.txt","w+")
res0 = requests.get(url)
res0.encoding="utf-8"
soup = BeautifulSoup(res0.text,"html.parser")
newsgroup=[]
for news in soup.select("li"):
    if len(news.select("a"))>0:
        print(news.select("a")[0].text)
        title=news.select("a")[0].text
        f.write(title)
f.close()

f0 = open(css.txt,r)
qz=[]
qz=f0.read()
f0.close()
print(qz)

words = list(jieba.cut(qz))

ul={:,,",,,,,,,, ,\u3000,,\n}
dic={}

keys = set(words)-ul
for i in keys:
    dic[i]=words.count(i)

c = list(dic.items())
c.sort(key=lambda x:x[1],reverse=True)

f1 = open(diectory.txt,w)
for i in range(10):
    print(c[i])
    for words_count in range(c[i][1]):
        f1.write(c[i][0]+ )
f1.close()

df = pandas.DataFrame(words)

print(df.head())

with sqlite3.connect(newsdb3.sqlite) as db:

    df.to_sql(newsdb3,con = db)


f3 = open(diectory.txt,r)
cy_file = f3.read()
f3.close()
cy = WordCloud().generate(cy_file)
plt.imshow(cy)
plt.axis("off")
plt.show()

一個完整的python大作業