python爬取虎嗅網資料
阿新 • • 發佈:2018-12-14
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Crawl the article listing of a huxiu.com channel and store it in MongoDB.

The script asks the channel's "ajaxGetMore" endpoint how many pages exist,
fetches every page, parses the returned HTML fragment with BeautifulSoup and
inserts one document per article into the ``spiders.huxiu`` collection.
"""
import requests
import pymongo
from bs4 import BeautifulSoup

# MongoDB target: database "spiders", collection "huxiu" on the local server.
client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['spiders']['huxiu']

# Endpoint that serves further pages of a channel listing.
url = "https://www.huxiu.com/channel/ajaxGetMore"
headers = {
    "Referer": "https://www.huxiu.com/channel/104.html",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
}

# Form fields sent with every request.
# NOTE(review): the hash code looks like a site-issued token that may expire
# -- confirm and refresh it if requests start failing.
HUXIU_HASH_CODE = "9501c2ced764ebbe029807a9f17790fa"
CHANNEL_ID = 104

# Seconds to wait for the server before giving up; without a timeout
# requests would block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10


def _fetch_page(page):
    """POST a request for one listing page and return the JSON ``data`` object.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    payload = {
        "huxiu_hash_code": HUXIU_HASH_CODE,
        "page": page,
        "catId": CHANNEL_ID
    }
    response = requests.post(
        url, data=payload, headers=headers, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()['data']


def get_total_page():
    """Return the number of listing pages reported by the endpoint.

    The value is read from the ``total_page`` field of the first page's
    response and coerced to ``int`` so it can be used directly in ``range``.
    """
    return int(_fetch_page(1)['total_page'])


def main(page):
    """Return the HTML fragment for listing page ``page``.

    The fragment is the ``data`` field nested inside the response's ``data``
    object; it is what ``parse_data`` expects as input.
    """
    return _fetch_page(page)['data']


def _attr(node, name):
    """Return attribute ``name`` of ``node``, or None if either is missing."""
    if node is None:
        return None
    return node.get(name)


def _parse_article(item):
    """Build the document for one ``div.mod-art`` node.

    Returns None when the node has no ``a.transition`` link, because title and
    URL cannot be extracted then. Other missing pieces are stored as None.
    """
    link_node = item.find("a", attrs={"class": "transition"})
    if link_node is None:
        return None

    img_node = link_node.find("img")
    face_node = item.find("div", attrs={"class": "author-face"})
    face_link = face_node.find('a') if face_node is not None else None
    face_img = face_node.find('img') if face_node is not None else None
    name_node = item.find("span", attrs={"class": "author-name"})

    return {
        "article_aid": item.get("data-aid"),
        "article_title": link_node.get('title'),
        # The key is misspelled ("ulr") but kept as-is so new documents match
        # the ones already stored in the collection.
        "article_ulr": link_node.get("href"),
        # Prefer the "data-original" attribute and fall back to "src".
        "article_img": _attr(img_node, "data-original") or _attr(img_node, "src"),
        "member_url": _attr(face_link, "href"),
        "author_face": _attr(face_img, "src"),
        "author_name": name_node.string if name_node is not None else None,
    }


def parse_data(data):
    """Parse one page's HTML fragment and insert each article into MongoDB.

    Args:
        data: HTML markup as returned by ``main``.

    Prints "success" after every inserted document. Nodes from which no
    article can be built are skipped.
    """
    soup = BeautifulSoup(data, "lxml")
    for item in soup.find_all("div", attrs={"class": "mod-art"}):
        article = _parse_article(item)
        if article is None:
            continue
        # Store the record. insert_one replaces Collection.insert, which was
        # deprecated in PyMongo 3 and removed in PyMongo 4.
        collection.insert_one(article)
        print("success")


if __name__ == "__main__":
    pages = get_total_page()
    for page in range(1, pages + 1):
        print("正在爬去第{}頁".format(page))
        data = main(page)
        parse_data(data)