1. 程式人生 > 爬蟲之貓眼電影

爬蟲之貓眼電影

獲取電影相關資料:以下程式逐頁爬取貓眼電影榜單(共 10 頁),以正規表示式擷取片名、主演與上映時間,並寫入 MySQL 的 FilmInfo 資料表。

import re
from urllib.request import urlopen
from urllib import request
import pymysql


def get_content(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The request is sent with a fixed Firefox ``User-Agent`` header.

    :param url: address of the resource to fetch.
    :return: the full response body as a ``str``.
    :raises urllib.error.URLError: if the resource cannot be opened.
    :raises UnicodeDecodeError: if the body is not valid UTF-8.
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
    req = request.Request(url, headers={'User-Agent': user_agent})
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, rather than whenever the
    # object happens to be garbage-collected.
    with urlopen(req) as response:
        return response.read().decode('utf-8')


def parser_all_page():
    """Scrape ten board pages and store every matched film in MySQL.

    For offsets 0, 10, ..., 90 the page ``http://maoyan.com/board/4`` is
    downloaded with ``get_content``. Each regex match is a
    ``(name, stars, release time)`` tuple, which is inserted as one row of
    the ``FilmInfo`` table (created on first use).

    :return: a list with one entry per page; each entry is the list of
        tuples matched on that page (empty if nothing matched).
    """
    # Compiled once: the pattern does not change between pages.
    # NOTE(review): the pattern is tightly coupled to the page markup and to
    # the exact label text; confirm it still matches the live site.
    pattern = re.compile(
        r'<div class="movie-item-info">\s+<p class="name"><a href="/films/\d+" title="\w+" data-act="boarditem-click" data-val="{movieId:\d+}">([^\s+]*)</a></p>\s+<p class="star">\s+主演:([^\s+]*)\s+</p>\s+<p class="releasetime">上映時間:([^\s+]*)</p>\s+</div>'
    )
    # IF NOT EXISTS lets the script be re-run without failing on the table
    # that a previous run already created (rows are appended on each run).
    create_table = 'CREATE TABLE IF NOT EXISTS FilmInfo(FilmName VARCHAR (30),FilmStar VARCHAR(40),ReleaseTime VARCHAR(40))'
    # Placeholders are bound by the driver, so quotes or other special
    # characters in scraped text cannot break or alter the statement.
    insert_row = 'INSERT INTO FilmInfo VALUES (%s, %s, %s)'

    FilmInfo = []
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    conn = pymysql.connect(user='root', password='westos', charset='utf8', autocommit=True, db='FilmInfo')
    with conn:
        # A single cursor is reused for every statement and closed on exit.
        with conn.cursor() as cur:
            cur.execute(create_table)
            for i in range(10):
                url = 'http://maoyan.com/board/4?offset=%d' % (i * 10)
                print("正在爬取第%d頁" % (i + 1))
                content = get_content(url)
                Info = pattern.findall(content)
                print(Info)
                # executemany is a no-op for an empty list of rows.
                cur.executemany(insert_row, Info)
                FilmInfo.append(Info)

    return FilmInfo

def main():
    """Script entry point: crawl all board pages and store the results."""
    parser_all_page()


# Start crawling only when run as a script; importing this module (for
# example to reuse get_content) must not trigger network or database access.
if __name__ == "__main__":
    main()

在這裡插入圖片描述