# Python web crawler for collecting information from the web
# Author: 阿新 • Published: 2019-02-19
from bs4 import BeautifulSoup
import re
import urllib.parse
import urllib.request
import os
import datetime
import json
# params CategoryId=808 CategoryType=SiteHome ItemListActionName=PostList PageIndex=3 ParentCategoryId=0 TotalPostCount=4000
def getHtml(url, values):
    """Fetch *url* with *values* encoded as a GET query string.

    Args:
        url: Base URL to request.
        values: Mapping of query-string parameters.

    Returns:
        The response body decoded as UTF-8.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values)
    # BUG FIX: the original built `headers` but never sent it, so the
    # server never saw the browser User-Agent. Attach it via a Request
    # object; `with` ensures the connection is closed.
    request = urllib.request.Request(url + '?' + data, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
#獲取資料
def requestCnblogs(index):
    """Request one page of the post listing and return the raw HTML.

    Args:
        index: 1-based page number passed through as PageIndex.
    """
    print('請求資料')
    params = {
        'CategoryId': 808,
        'CategoryType': 'SiteHome',
        'ItemListActionName': 'PostList',
        'PageIndex': index,
        'ParentCategoryId': 0,
        'TotalPostCount': 4000,
    }
    return getHtml('https://www.csdn.net/', params)
#解析最外層
def blogParser(index):
    """Fetch listing page *index* and parse it into a list of entry dicts."""
    html = requestCnblogs(index)
    soup = BeautifulSoup(html, 'html.parser')
    # Each 'list_con' div is one blog entry; the page carries at most 20.
    containers = soup.find_all('div', attrs={'class': 'list_con'}, limit=20)
    return [analyzeBlog(container) for container in containers]
#解析每一條資料
def analyzeBlog(item):
    """Extract one blog listing's fields from its container element.

    Args:
        item: A bs4 element for one 'list_con' div.

    Returns:
        Dict with title/href/summary/author/author_url/create_time/
        comment_num/view_num; a key is omitted when its element is
        missing from the markup (the original raised IndexError instead).
    """
    result = {}
    # BUG FIX: the original indexed [0] *before* its `is not None` checks,
    # so those guards were dead code and a missing element crashed the
    # parse. Guard on list emptiness instead.
    titles = find_all(item, 'div', 'title')
    if titles:
        links = titles[0].find_all('a')
        if links:
            # Blog title and link
            result["title"] = links[0].string.replace("\n", "").strip()
            result["href"] = links[0]['href']
    p_summary = find_all(item, 'div', 'summary oneline')
    if p_summary:
        # Short description (original chained two identical replaces)
        result["summary"] = p_summary[0].text.replace("\n", "").strip()
    footers = find_all(item, 'dl', 'list_userbar')
    if footers:
        footer = footers[0]
        authors = find_all(footer, 'dd', 'name')
        if authors:
            author_links = authors[0].find_all('a')
            if author_links:
                # Author name and profile URL
                result["author"] = author_links[0].string.replace("\n", "").strip()
                result["author_url"] = author_links[0]['href']
        times = find_all(footer, 'dd', 'time')
        if times:
            result["create_time"] = times[0].text.replace("\n", "").strip()
        comments = find_all(footer, 'dd', 'read_num')
        if comments:
            spans = comments[0].find_all('span')
            if spans:
                result["comment_num"] = spans[0].text
        # NOTE: the trailing space in 'common_num ' mirrors the site's
        # literal class attribute — do not "fix" it.
        views = find_all(footer, 'dd', 'common_num ')
        if views:
            spans = views[0].find_all('span')
            if spans:
                result["view_num"] = spans[0].string
    return result
def find_all(item, attr, c):
    """Return at most one descendant of *item* with tag *attr* and class *c*."""
    selector = {'class': c}
    return item.find_all(attr, attrs=selector, limit=1)
def writeToTxt(list_name, file_path):
    """Serialize *list_name* to *file_path* as a JSON array.

    Args:
        list_name: List of JSON-serializable records (parsed blog entries).
        file_path: Destination file path.
    """
    try:
        # BUG FIX: the original wrote str(dict) — single-quoted Python
        # repr, which is NOT valid JSON (its own comment warned about
        # exactly this). Use the json module; `with` guarantees the file
        # is closed even if a write fails mid-way.
        with open(file_path, "w+", encoding='utf-8') as fp:
            print(file_path)
            json.dump(list_name, fp, ensure_ascii=False)
    except IOError:
        print("fail to open file")
def saveBlogs():
    """Crawl each listing page and save its parsed entries as a JSON file.

    Returns:
        'success' once every page in the range has been written.
    """
    for i in range(1, 2):
        print('request for ' + str(i) + '...')
        # BUG FIX: the original called blogParser(1), ignoring the loop
        # variable and re-fetching page 1 on every iteration.
        blogs = blogParser(i)
        # Write this page's records under <date dir>/blog_<i>.json
        path = createFile()
        writeToTxt(blogs, path + '/blog_' + str(i) + '.json')
        print('第' + str(i) + '頁已經完成')
    return 'success'
def createFile(base_path='F:/Blog/'):
    """Ensure today's dated output directory exists and return its path.

    Args:
        base_path: Root output directory. The default keeps the original
            hard-coded location, so existing callers are unaffected.

    Returns:
        '<base_path><YYYY-MM-DD>', created if it did not already exist.
    """
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    path = base_path + date
    # makedirs(exist_ok=True) replaces the original check-then-create
    # race and also creates any missing parent directories.
    os.makedirs(path, exist_ok=True)
    return path
if __name__ == '__main__':
    # Fetch and print the first listing page when run as a script.
    result = blogParser(1)
    print(result)
    # print("python爬取CSDN:")
    # info = saveBlogs()
    # print(info)