程式人生 > 簡單實現一個爬蟲

簡單實現一個爬蟲

Python 程式碼如下:

import requests
from requests.exceptions import RequestException
import re
import json

def get_one_page(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a silent host.

    Returns:
        The response text when the server answers with status 200;
        ``None`` for any other status code or when the request raises a
        ``RequestException`` (timeouts included).
    """
    # Keep the try body to the network call only, so unrelated errors are
    # not swallowed by the handler.
    try:
        res = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if res.status_code == 200:
        return res.text
    return None
def parse_one_page(html):
    """Extract the movie entries from the markup of a board page.

    Args:
        html: Page markup as text. ``None`` or an empty string (for example
            after a failed download) produces no entries instead of raising.

    Yields:
        One dict per matched ``<dd>`` element with the string fields
        ``index``, ``image``, ``title``, ``actor``, ``time`` and ``score``.
    """
    if not html:
        return
    # Raw string literals: the regex escapes must reach ``re`` untouched
    # rather than being interpreted (and warned about) as string escapes.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # let "." also match the newlines inside one <dd> element
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drops the first 3 characters after trimming whitespace --
            # presumably a fixed label in front of the cast list; confirm
            # against the live page.
            'actor': item[3].strip()[3:],
            # Drops the first 5 characters after trimming whitespace --
            # presumably a fixed label in front of the date; confirm.
            'time': item[4].strip()[5:],
            # The score is split into two elements in the markup; join them.
            'score': item[5] + item[6]
        }
def main():
    """Download the maoyan.com board page and print every entry as JSON.

    Raises:
        SystemExit: If the page could not be downloaded (``get_one_page``
            returned ``None``).
    """
    url = 'http://maoyan.com/board/4'
    html = get_one_page(url)
    if html is None:
        # Stop with a readable message instead of handing None to the parser.
        raise SystemExit('Failed to fetch %s' % url)
    for item in parse_one_page(html):
        # No ``encoding`` keyword: it does not exist on Python 3 (TypeError)
        # and UTF-8 is already the default on Python 2.
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        print(json.dumps(item, ensure_ascii=False, sort_keys=False, indent=4))

# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

執行結果:

{
    "index": "1", 
    "title": "霸王別姬", 
    "image": "http://p1.meituan.net/movie/[email protected]_220h_1e_1c", 
    "actor": "張國榮,張豐毅,鞏俐", 
    "score": "9.6", 
    "time": "1993-01-01(中國香港)"
}
{
    "index": "2", 
    "title": "肖申克的救贖", 
    "image": "http://p0.meituan.net/movie/[email protected]_220h_1e_1c",
    "actor": "蒂姆·羅賓斯,摩根·弗里曼,鮑勃·岡頓",
    "score": "9.5",
    "time": "1994-10-14(美國)"
}
{
    "index": "3",
    "title": "羅馬假日",
    "image": "http://p0.meituan.net/movie/23/[email protected]_220h_1e_1c",
    "actor": "格利高利·派克,奧黛麗·赫本,埃迪·艾伯特",
    "score": "9.1",
    "time": "1953-09-02(美國)"
}
{
    "index": "4",
    "title": "這個殺手不太冷",
    "image": "http://p0.meituan.net/movie/[email protected]_220h_1e_1c",
    "actor": "讓·雷諾,加里·奧德曼,娜塔莉·波特曼",
    "score": "9.5",
    "time": "1994-09-14(法國)"
}
{
    "index": "5",
    "title": "教父",
    "image": "http://p0.meituan.net/movie/92/[email protected]_220h_1e_1c",
    "actor": "馬龍·白蘭度,阿爾·帕西諾,詹姆斯·凱恩",
    "score": "9.3",
    "time": "1972-03-24(美國)"
}
{
    "index": "6",
    "title": "泰坦尼克號",
    "image": "http://p0.meituan.net/movie/11/[email protected]_220h_1e_1c",
    "actor": "萊昂納多·迪卡普里奧,凱特·溫絲萊特,比利·贊恩",
    "score": "9.5",
    "time": "1998-04-03"
}
{
    "index": "7",
    "title": "龍貓",
    "image": "http://p0.meituan.net/movie/99/[email protected]_220h_1e_1c",
    "actor": "日高法子,阪本千夏,糸井重裡",
    "score": "9.2",
    "time": "1988-04-16(日本)"
}
{
    "index": "8",
    "title": "唐伯虎點秋香",
    "image": "http://p0.meituan.net/movie/62/[email protected]_220h_1e_1c",
    "actor": "周星馳,鞏俐,鄭佩佩",
    "score": "9.2",
    "time": "1993-07-01(中國香港)"
}
{
    "index": "9",
    "title": "千與千尋",
    "image": "http://p0.meituan.net/movie/[email protected]_220h_1e_1c",
    "actor": "柊瑠美,入野自由,夏木真理",
    "score": "9.3",
    "time": "2001-07-20(日本)"
}
{
    "index": "10",
    "title": "魂斷藍橋",
    "image": "http://p0.meituan.net/movie/12/[email protected]_220h_1e_1c",
    "actor": "費雯·麗,羅伯特·泰勒,露塞爾·沃特森",
    "score": "9.2",
    "time": "1940-05-17(美國)"
}