百度經驗正文爬蟲
阿新 • • 發佈:2019-02-19
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the <p> paragraphs of the articles linked from a Baidu Jingyan user page.

The script walks a fixed set of listing pages for one hard-coded user id,
collects every ``/article/<id>.html`` link found on each page, downloads the
linked articles and prints the text found between ``<p>`` and ``</p>`` tags.
"""
import random
import re
import sys
import time

import requests

# User listing page; the ``pn`` query value is appended to this prefix.
LISTING_URL_PREFIX = (
    'http://jingyan.baidu.com/user/npublic'
    '?uid=d1b612bceb0dc22ba8ffe137&pn='
)
# Article links in the listing are site-relative, so they need this prefix.
ARTICLE_URL_PREFIX = 'https://jingyan.baidu.com'

# ``pn`` values requested: 0, 7, 14, ..., 49 (presumably a paging offset).
PAGE_OFFSETS = range(0, 50, 7)

# Each article is requested and printed this many times, exactly as the
# original script did.
# NOTE(review): the reason for repeating is not visible here -- set this to 1
# if a single pass per article is what is actually wanted.
FETCHES_PER_ARTICLE = 20

# Seconds to wait for a server before giving up; without a timeout
# ``requests`` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Bounds (seconds) of the random pause taken after every article request.
SLEEP_RANGE = (1, 2)

# Compiled once instead of being looked up again on every loop iteration.
ARTICLE_LINK_RE = re.compile(r'<a href="(/article/\w+\.html)" title="')
# ``.`` does not match newlines here, so only single-line paragraphs match.
PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>')


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force UTF-8 rather than trusting the charset guessed from the headers.
    response.encoding = 'utf-8'
    return response.text


def extract_article_paths(listing_html):
    """Return the site-relative article paths linked from a listing page."""
    return ARTICLE_LINK_RE.findall(listing_html)


def extract_paragraphs(article_html):
    """Return the inner text of every ``<p>...</p>`` pair in *article_html*."""
    return PARAGRAPH_RE.findall(article_html)


def warn(url, error):
    """Report a failed download on stderr without stopping the crawl."""
    sys.stderr.write('skipping %s: %s\n' % (url, error))


def print_article(article_url):
    """Download one article and print each of its paragraphs."""
    for paragraph in extract_paragraphs(fetch_html(article_url)):
        print(' ' + paragraph)


def main():
    """Crawl every listing page and print the paragraphs of each article."""
    for offset in PAGE_OFFSETS:
        listing_url = LISTING_URL_PREFIX + str(offset)
        try:
            listing_html = fetch_html(listing_url)
        except requests.RequestException as error:
            # One bad listing page should not abort the remaining pages.
            warn(listing_url, error)
            continue

        for path in extract_article_paths(listing_html):
            article_url = ARTICLE_URL_PREFIX + path
            for _ in range(FETCHES_PER_ARTICLE):
                try:
                    print_article(article_url)
                except requests.RequestException as error:
                    warn(article_url, error)
                # Pause after every request, successful or not, to avoid
                # hammering the server.
                time.sleep(random.uniform(*SLEEP_RANGE))


if __name__ == '__main__':
    main()