爬百度新聞

阿新 • • 發佈：2017-10-19

多線程 python

#################################
#date:2017-10-1
#version:1.0
# -*- coding:utf-8 -*-
import threading
import re
import urllib2
import chardet
from BeautifulSoup import BeautifulSoup
import time
import os
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding reachable again
# (it is removed from the sys module at interpreter start-up), then the
# process-wide default codec used for implicit str <-> unicode conversions
# is switched to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")
class myThreads(threading.Thread):
    """Worker thread that downloads one news page and appends its text to a file.

    ``threadname`` holds the URL of the page to download and ``filename`` the
    path of the text file that ``extract_news_content`` appends to.
    """

    def __init__(self, threadname, filename):
        threading.Thread.__init__(self)
        self.threadname = threadname
        self.filename = filename

    def run(self):
        """Download ``self.threadname`` into ``self.filename``, then pause 2 seconds."""
        # Single-argument print(...) emits the same text as the former
        # Python 2 print statements and also parses on Python 3.
        print("Starting download url: %s" % self.threadname)
        extract_news_content(self.threadname, self.filename)
        time.sleep(2)
        print("Exiting " + self.threadname)
def remove_js_css(content):
    """Strip non-content markup from an HTML string.

    Removes, in this order: ``<script>...</script>``, ``<style>...</style>``,
    ``<!-- ... -->`` comments, ``<meta ...>`` tags and ``<ins>...</ins>``
    elements. Matching is case-insensitive and may span several lines.

    :param content: HTML text.
    :return: ``content`` with the constructs above removed.
    """
    patterns = (
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'<!--.*?-->',
        r'<meta.*?>',
        r'<ins.*?</ins>',
    )
    for pattern in patterns:
        # re.S lets ".*?" cross newlines so multi-line blocks are removed whole.
        content = re.compile(pattern, re.I | re.S).sub('', content)
    return content
def remove_empty_line(content):
    """Drop whitespace-only lines and collapse runs of newlines into one.

    :param content: text, possibly containing blank lines.
    :return: the text with every run of consecutive newlines reduced to a
        single ``\\n`` and whitespace-only lines removed.
    """
    # First blank out lines that contain nothing but whitespace ...
    without_blank = re.compile(r'^\s+$', re.M | re.S).sub('', content)
    # ... then squeeze the newline runs that are left behind.
    return re.compile(r'\n+', re.M | re.S).sub('\n', without_blank)
def remove_any_tag(s):
    """Return ``s`` with every ``<...>`` tag removed and outer whitespace stripped."""
    return re.sub(r'<[^>]+>', '', s).strip()
def remove_any_tag_but_a(s):
    """Measure how much of the text in ``s`` sits inside ``<a>`` links.

    :param s: an HTML fragment.
    :return: ``(link_length, text_length)`` where ``link_length`` is the total
        length of the content found between ``<a ...>`` and ``</a>`` and
        ``text_length`` is the length of ``s`` after all tags are stripped.
    """
    # "\b" keeps the match to real anchor tags: the former "<a[^r]" pattern
    # skipped a bare "<a>" and also matched tags such as <abbr> or <address>.
    link_texts = re.findall(r'<a\b[^>]*>(.*?)</a>', s, re.I | re.S)
    plain_text = remove_any_tag(s)
    return len(''.join(link_texts)), len(plain_text)
def remove_image(s, n=50):
    """Replace every ``<img ...>`` tag in ``s`` with a placeholder of ``n`` 'a' characters.

    The placeholder makes each image count as ``n`` characters of text when
    the length of the remaining text is measured later.

    :param s: an HTML fragment.
    :param n: placeholder length per image (default 50).
    :return: ``s`` with the image tags substituted.
    """
    placeholder = 'a' * n
    return re.compile(r'<img.*?>', re.I | re.M | re.S).sub(placeholder, s)
def remove_video(s, n=1000):
    """Replace every ``<embed ...>`` tag in ``s`` with a placeholder of ``n`` 'a' characters.

    The placeholder makes each embedded object count as ``n`` characters of
    text when the length of the remaining text is measured later.

    :param s: an HTML fragment.
    :param n: placeholder length per embed tag (default 1000).
    :return: ``s`` with the embed tags substituted.
    """
    placeholder = 'a' * n
    return re.compile(r'<embed.*?>', re.I | re.M | re.S).sub(placeholder, s)
def sum_max(values):
    """Locate the contiguous run of ``values`` with the largest sum.

    :param values: sequence of numbers.
    :return: ``(left, right)`` such that ``values[left:right]`` is the
        maximum-sum contiguous run; ``(0, 0)`` for an empty sequence. When
        several runs tie, the earliest-ending one is returned, and a run never
        starts with a prefix whose sum is zero or negative.
    """
    if not values:
        return 0, 0
    best_sum = values[0]
    best_left = best_right = 0
    run_sum = 0
    run_left = 0
    for index, value in enumerate(values):
        if run_sum <= 0:
            # A non-positive running sum cannot help: start a new run here.
            run_sum = value
            run_left = index
        else:
            run_sum += value
        if run_sum > best_sum:
            best_sum = run_sum
            best_left, best_right = run_left, index
    return best_left, best_right + 1
def method_1(content, k=1):
    """Find the densest text region of ``content`` by scoring groups of lines.

    The lines of ``content`` are taken ``k`` at a time. Each group is scored
    as (length of its tag-free text) - (length of its link text) - 8, with
    images and embeds first replaced by fixed-length placeholders. The
    contiguous range of groups with the highest total score is selected.

    :param content: cleaned HTML text; lines are separated by ``\\n``.
    :param k: number of lines per scored group (default 1).
    :return: ``(left, right, start_offset, end_offset)`` where
        ``left``/``right`` are the half-open range of selected groups and the
        offsets are the character lengths of the lines preceding each
        boundary. ``(None, None, None, None)`` when ``content`` is empty.
    """
    if not content:
        return None, None, None, None
    lines = content.split('\n')
    group_value = []
    for start in range(0, len(lines), k):
        group = '\n'.join(lines[start:start + k])
        group = remove_video(remove_image(group))
        link_len, text_len = remove_any_tag_but_a(group)
        # The constant 8 is a fixed per-group penalty, so short lines score
        # negative. NOTE(review): presumably an empirical threshold - confirm.
        group_value.append((text_len - link_len) - 8)
    left, right = sum_max(group_value)
    # Group indices are converted back to line indices (index * k) before
    # measuring; for the default k=1 the two are the same.
    return (left, right,
            len('\n'.join(lines[:left * k])),
            len('\n'.join(lines[:right * k])))
def extract(content):
    """Return the main text region of an HTML page.

    Scripts, styles, comments, meta/ins tags and blank lines are removed
    first; the lines selected by ``method_1`` are then joined with ``\\n``.

    :param content: raw HTML text.
    :return: the selected lines (tags inside them are kept), or ``''`` when
        nothing is left after cleaning.
    """
    cleaned = remove_empty_line(remove_js_css(content))
    if not cleaned:
        return ''
    left, right, _, _ = method_1(cleaned)
    return '\n'.join(cleaned.split('\n')[left:right])
# Given a URL, append the body text of that news page to a text file.
def extract_news_content(web_url, file_name):
    """Download ``web_url`` and append its extracted body text to ``file_name``.

    HTTP and URL errors, as well as socket timeouts, are reported on stdout
    and leave the file untouched. Nothing is written either when the response
    is empty or its character encoding cannot be detected.

    :param web_url: address of the news page.
    :param file_name: path of the text file opened in append mode.
    """
    import socket  # local import: only needed for the timeout handler below

    html = ""
    request = urllib2.Request(web_url)
    # Add a browser-like User-Agent header to the request.
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    try:
        # Same 10 second timeout as the search requests, so a stalled server
        # cannot block the worker thread forever.
        html = opener.open(request, timeout=10).read()
    except urllib2.HTTPError as e:
        print(e.code)
    except urllib2.URLError as e:
        print(e.reason)
    except socket.timeout as e:
        print(e)
    if not html:
        return
    # Detect the page encoding with the third-party chardet module.
    infoencode = chardet.detect(html)['encoding']
    if infoencode is None:
        return
    html = html.decode(infoencode, 'ignore')
    soup = BeautifulSoup(html)
    content = soup.renderContents()
    # Extract the article body, then flatten it into one line of plain text.
    content_text = extract(content)
    # NOTE(review): the published listing showed a no-op re.sub(" ", " ");
    # the "&nbsp;" entity is assumed to have been rendered away by the page.
    content_text = re.sub("&nbsp;", " ", content_text)
    content_text = re.sub("&gt;", "", content_text)
    content_text = re.sub("&quot;", '""', content_text)
    content_text = re.sub("<[^>]+>", "", content_text)
    content_text = re.sub("\n", "", content_text)
    with open(file_name, 'a') as out:  # append
        out.write(content_text)
# Crawl Baidu News search results (Chinese search); key is the search keyword.
def _open_page(url):
    """Open ``url`` with a browser-like User-Agent header and a 10 second timeout."""
    request = urllib2.Request(url)
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    return urllib2.urlopen(request, timeout=10)


def _load_visited_urls(path):
    """Return the URLs stored one per line in ``path``; ``[]`` when the file is missing."""
    if not os.path.exists(path):
        return []
    with open(path, 'r') as visited_file:
        return [line.strip('\n') for line in visited_file]


def search(key):
    """Crawl Baidu News results for ``key`` and download every unseen article.

    For each result whose link is not yet listed in the visited-URL file, a
    text file holding title, source, time and URL is written and a
    ``myThreads`` worker is queued to append the article body. All workers are
    started, and then joined, once result paging has finished.

    Paging stops after 50 pages, when no next-page link is found, or when the
    visited list reaches 240 entries; a page is abandoned once the list
    reaches 120 entries. The list includes URLs recorded by earlier runs.

    :param key: search keyword; also used (GB2312-encoded) as directory name.
    """
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    visited_path = r'E:\\Python27\\visited-cn.txt'
    # NOTE(review): the published listing showed " &nbsp" here; the first
    # "&nbsp;" entity is assumed to have been rendered away by the page.
    separator = "&nbsp;&nbsp"

    response = _open_page(search_url.replace('key_word', key))
    key_dir = key.encode('gb2312')
    file_dir = r"E:\\Python27\\newscn\\%s" % (key_dir)
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    # URLs already crawled: the list keeps the count used by the limits below,
    # the set gives constant-time membership tests.
    visited_url_list = _load_visited_urls(visited_path)
    seen = set(visited_url_list)

    real_visited = 0
    threads = []
    for count in range(50):  # at most 50 result pages
        html = response.read()
        response.close()
        soup = BeautifulSoup(html)
        results = soup.findAll("div", {"class": "result"})  # result set object
        print("data: %s %s" % (len(results), count))  # results on this page
        for result in results:
            # Parse title, source, time and URL of the news item first.
            anchor = result.find('a')
            if anchor is None:  # result block without a link: nothing to crawl
                continue
            contenttitle = anchor.renderContents().decode('utf-8', 'ignore')
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(anchor.get("href"))
            already_visited = contentlink in seen
            print("url: %s status %d" % (contentlink, int(already_visited)))
            if not already_visited:
                summary = result.find('p')
                raw = summary.renderContents() if summary is not None else ''
                cut = raw.find(separator)
                if cut == -1:
                    author_raw, time_raw = raw, ''
                else:
                    author_raw = raw[:cut]  # source
                    # "+ 1" skips the ";" that closes the second entity.
                    time_raw = raw[cut + len(separator) + 1:]  # time
                contentauthor = author_raw.decode('utf-8', 'ignore')
                contenttime = time_raw.decode('utf-8', 'ignore')
                real_visited += 1
                file_name = r"E:\\Python27\\newscn\\%s\\%d.txt" % (key_dir, real_visited)
                with open(file_name, 'w') as out:
                    out.write(contenttitle.encode('utf-8'))
                    out.write('\n')
                    out.write(contentauthor.encode('utf-8'))
                    out.write('\n')
                    out.write(contenttime.encode('utf-8'))
                    out.write('\n' + contentlink + '\n')
                # The worker appends the article body to the same file later.
                threads.append(myThreads(contentlink, file_name))
                visited_url_list.append(contentlink)
                seen.add(contentlink)
                # Persist immediately so the record survives an aborted run.
                with open(visited_path, 'a') as visited_file:
                    visited_file.write(contentlink + '\n')
            if len(visited_url_list) >= 120:
                break
        # Move on to the next page.
        print("page: %s url address: %s" % (count, visited_url_list))
        if len(visited_url_list) >= 240:
            break
        # The first page uses the first class="n" link; later pages use the
        # second one. NOTE(review): presumably the first is then "previous".
        next_num = 0 if count == 0 else 1
        try:
            next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']
        except (IndexError, KeyError, TypeError):
            break  # no further page link
        response = _open_page(next_page)
    for th in threads:
        th.daemon = True
        th.start()
    for th in threads:
        th.join()
if __name__ == '__main__':
    # Script entry point: ask for a keyword on stdin (Python 2 raw_input)
    # and crawl the news results for it.
    key_word = raw_input('input key word:')
    search(key_word)

本文出自 “12758454” 博客，請務必保留此出處http://12768454.blog.51cto.com/12758454/1973941

爬百度新聞

多線程 python ################################# #data:2017-10-1 #version:1.0 # -*- coding:utf-8 -*- import threading import re import urllib2 import chard

爬百度100次

框架 url odin class range aid quest htm ise 1 import requests 2 import time 3 4 def getHTMLText(url): 5 try: 6 r= requ

人工智慧測試-爬百度成語-測成語接龍

前言本意，昨晚想發一文，在梳理思路找筆記一小半時，一朋友跟伴侶吵架了，突然從技術寫文轉變到情感“磚家”，微信聊了一個多小時，腦力都用光了，早上開會上傳了一下調整後的程式碼，中午補一下文，完成既定目標。一、起因去年在測試公司的人工智慧產品中的一功能【成語接龍】，人工語音測試總玩不過【琥珀】

python爬蟲爬百度雲盤的資源

最近百度雲盤不知道為啥不提供資源檢索，正好最近看了一下python，正好來練練手，寫歌爬蟲爬一下百度雲盤的資源。分析了一下百度雲盤的網友原始碼和js檔案，裡面有大量ajax的東西，利用json傳輸資料，前端顯示。話說，這樣資料爬去就方便多了，也不要用scrapy啥的

java爬取百度首頁源代碼

clas read 意思出現異常 nts java.net new 有意思 all 爬蟲感覺挺有意思的，寫一個最簡單的抓取百度首頁html代碼的程序。雖然簡單了一點，後期會加深的。 1 package test; 2 3 import java.io.B

requests+xpath+map爬取百度貼吧

name ads int strip 獲取 app open http col 1 # requests+xpath+map爬取百度貼吧 2 # 目標內容:跟帖用戶名,跟帖內容,跟帖時間 3 # 分解: 4 # requests獲取網頁 5 # xpath提取內

Python開發簡單爬蟲（二）---爬取百度百科頁面數據

class 實例實例代碼編碼 mat 分享 aik logs title 一、開發爬蟲的步驟 1.確定目標抓取策略：打開目標頁面，通過右鍵審查元素確定網頁的url格式、數據格式、和網頁編碼形式。 ①先看url的格式, F12觀察一下鏈接的形式;② 再看目標文本信息的

python爬取百度搜索圖片

知乎需要 with 異常 mage 不足 request height adr 在之前通過爬取貼吧圖片有了一點經驗，先根據之前經驗再次爬取百度搜索界面圖片廢話不說，先上代碼 #!/usr/bin/env python # -*- coding: utf-8 -*- #

Python爬取百度貼吧數據

utf-8 支持我 family encode code word keyword 上一條時間　　本渣除了工作外，在生活上還是有些愛好，有些東西，一旦染上，就無法自拔，無法上岸，從此走上一條不歸路。花鳥魚蟲便是我堅持了數十年的愛好。　　本渣還是需要上班，才能支持我的

Python簡易爬蟲爬取百度貼吧圖片

decode works 接口 def 讀取 min baidu 得到 internal 　　　　　通過python 來實現這樣一個簡單的爬蟲功能，把我們想要的圖片爬取到本地。(Python版本為3.6.0) 一.獲取整個頁面數據　　 def getHtml(url)

python爬取百度搜索結果ur匯總

百度搜索 sta attr amp end rom range 百度篩選寫了兩篇之後，我覺得關於爬蟲，重點還是分析過程分析些什麽呢： 1）首先明確自己要爬取的目標　　比如這次我們需要爬取的是使用百度搜索之後所有出來的url結果 2）分析手動進行的獲取目標的過程，以便

爬蟲實例——爬取python百度百科相關一千個詞條

管理器 name 詞條 enc aik lib cnblogs response ons 調度器： import url_manager,html_downloader,html_parser,html_outputer class SpiderMain(object

python 爬取百度url

style not 域名 head dex fin compile threads www 1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # @Date : 2017-08-29 18:38:23 4

多線程爬取百度百科

lib item put 腳本 mit sin find client rtl 前言：EVERNOTE裏的一篇筆記，我用了三個博客才學完...真的很菜...百度百科和故事網並沒有太過不一樣，修改下編碼，debug下，就可以爬下來了，不過應該是我爬的東西太初級了，而且我爬到

【學習筆記】python爬取百度真實url

python 今天跑個腳本需要一堆測試的url，，，挨個找復制粘貼肯定不是程序員的風格，so，還是寫個腳本吧。環境：python2.7 編輯器：sublime text 3 一、分析一下首先非常感謝百度大佬的url分類非常整齊，都在一個

Python爬蟲實例（一）爬取百度貼吧帖子中的圖片

選擇圖片查看負責 targe mpat wid agent html headers 程序功能說明：爬取百度貼吧帖子中的圖片，用戶輸入貼吧名稱和要爬取的起始和終止頁數即可進行爬取。思路分析：一、指定貼吧url的獲取例如我們進入秦時明月吧，提取並分析其有效url如下

百度貼吧爬取(可以指定貼吧名及頁碼)

百度貼吧爬蟲 python#!/usr/bin/python # coding=utf-8 import urllib import urllib2 def loadPage(url,filename): ‘‘‘ 作用:根據URL發送請求,獲取服務器響應文件 html:返回的響應文

Python爬去百度音樂

百度音樂編譯器環境：Python3.6代碼：#!/usr/bin/env python #-*-coding=utf-8 -*- #AUTHOR:duwentao import requests import re import json def get_sids_by_name(name):

利用百度搜索結果爬取郵箱

.... sheet pro 編輯部 pic exception exc gecko 正則表達幫同學做一個關於爬取教授郵箱的任務，在百度搜索中輸入教授的名字+長江學者+郵箱，爬取並篩選每個教授的郵箱，最後把郵箱信息寫入到Excel表中：--爬取結果爭取率大概在50%-60

零基礎掌握百度地圖興趣點獲取POI爬蟲（python語言爬取）（基礎篇）

region map 基礎輸入 filter put mark page -h 實現目的：爬取昆明市範圍內的全部中學數據，包括名稱、坐標。先進入基礎篇，本篇主要講原理方面，並實現步驟分解，為python代碼編寫打基礎。因為是0基礎開始，所以講得會比較詳細。如實現目的

爬百度新聞

相關推薦