Learning Python web scraping: small spider examples
阿新 • Published 2018-12-16
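Three small examples follow. The first fetches the Zhihu explore page with requests, parses it with pyquery, and appends each question, author, and answer to a local text file. The CSS selectors match the page layout at the time of writing and may need updating: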
```python
import requests
from pyquery import PyQuery as pq

url = 'http://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

html = requests.get(url, headers=headers).text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
for item in items:
    question = item.find('h2').text()
    author = item.find('.author-link-line').text()
    answer = item.find('.content').text()
    # append each record to explore.txt, separated by a divider line
    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
```
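The second example crawls a Sina Weibo user through the m.weibo.cn container API, routing every request through an HTTP proxy with urllib. The getIndex endpoint returns JSON; the containerid of the user's weibo tab, taken from tabsInfo, is what allows paging through the posts: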
```python
import urllib.request
import json

id = '1320135280'
proxy_addr = "122.241.72.191:808"  # sample HTTP proxy; likely dead by now, replace with a live one


def user_proxy(url, proxy_addr):
    # fetch a URL through the proxy with a desktop User-Agent
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(req).read().decode('utf-8')
    return data


def get_containerid(url):
    # the containerid of the 'weibo' tab is needed to page through a user's posts
    data = user_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    for tag in content.get('tabsInfo').get('tabs'):
        if tag.get('tab_type') == 'weibo':
            return tag.get('containerid')


def get_user_info(id):
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
    data = user_proxy(url, proxy_addr)
    info = json.loads(data).get('data').get('userInfo')
    user = {}
    user['id'] = info.get('id')
    user['statuses_count'] = info.get('statuses_count')
    user['gender'] = info.get('gender')
    user['followers_count'] = info.get('followers_count')
    user['follow_count'] = info.get('follow_count')
    user['profile_url'] = info.get('profile_url')
    return user


def get_weibo(id, file):
    i = 1
    while True:
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
        weibo_url = url + '&containerid=' + get_containerid(url) + '&page=' + str(i)
        try:
            data = user_proxy(weibo_url, proxy_addr)
            cards = json.loads(data).get('data').get('cards')
            if len(cards) > 0:
                for j in range(len(cards)):
                    print("----- crawling page " + str(i) + ", post " + str(j) + " -----")
                    card_type = cards[j].get('card_type')
                    if card_type == 9:  # card_type 9 is an ordinary weibo post
                        mblog = cards[j].get('mblog')
                        attitudes_count = mblog.get('attitudes_count')
                        comments_count = mblog.get('comments_count')
                        created_at = mblog.get('created_at')
                        reposts_count = mblog.get('reposts_count')
                        scheme = cards[j].get('scheme')
                        text = mblog.get('text')
                        with open(file, 'a', encoding='utf-8') as fh:
                            fh.write("---- page " + str(i) + ", post " + str(j) + " ----" + "\n")
                            fh.write("URL: " + str(scheme) + "\n" +
                                     "Posted: " + str(created_at) + "\n" +
                                     "Text: " + text + "\n" +
                                     "Likes: " + str(attitudes_count) + "\n" +
                                     "Comments: " + str(comments_count) + "\n" +
                                     "Reposts: " + str(reposts_count) + "\n")
                i += 1
            else:
                break
        except Exception as e:
            print(e)
            break  # stop instead of retrying the same page forever


print(get_user_info(id))
get_weibo(id, 'mayun.txt')
```
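The third example pages through the same kind of Ajax timeline with requests, uses pyquery to strip HTML tags from each post's text, and saves the records to MongoDB with pymongo: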
```python
import requests
from urllib.parse import urlencode
from pymongo import MongoClient
from pyquery import PyQuery as pq

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2145291155',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

client = MongoClient()
db = client['weibo']
collection = db['mayun']
max_page = 14


def get_page(page):
    # build the Ajax URL for one page of the user's timeline
    params = {
        'type': 'uid',
        'value': '2145291155',
        'containerid': '1076032145291155',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('error', e.args)


def parse_page(jsonstr):
    if jsonstr:
        items = jsonstr.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            if not item:  # skip cards that are not ordinary posts
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # strip HTML tags from the body
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo


def save_to_mongo(result):
    if collection.insert_one(result):
        print('saved to mongo')


if __name__ == '__main__':
    for page in range(1, max_page + 1):
        jsonstr = get_page(page)
        results = parse_page(jsonstr)
        for result in results:
            save_to_mongo(result)
            print(result)
```
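Once a run finishes, a quick query confirms what landed in the database. This is a minimal sketch, assuming a local mongod and the weibo database / mayun collection names used above:

```python
from pymongo import MongoClient

client = MongoClient()
collection = client['weibo']['mayun']
print(collection.count_documents({}))   # how many posts were saved
for doc in collection.find().limit(3):  # peek at the first few records
    print(doc['text'])
```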