Web Scraping: Maoyan Movies
阿新 • Published: 2018-12-23
Fetching the movie data

The script below crawls the 10 pages of the Maoyan Top 100 board, extracts each film's name, stars, and release time with a regular expression, and writes the results into a MySQL table.
import re
from urllib import request
from urllib.request import urlopen

import pymysql


def get_content(url):
    """Fetch the raw HTML of a page, sending a desktop browser User-Agent."""
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
    req = request.Request(url, headers={'User-Agent': user_agent})
    content = urlopen(req).read().decode('utf-8')
    return content


def parser_all_page():
    """Crawl the 10 pages of the Maoyan Top 100 board and store the films in MySQL."""
    FilmInfo = []
    conn = pymysql.connect(user='root', password='westos', charset='utf8',
                           autocommit=True, db='FilmInfo')
    with conn:
        cur = conn.cursor()
        # IF NOT EXISTS keeps the script from failing when it is run a second time.
        create_table = ('CREATE TABLE IF NOT EXISTS FilmInfo('
                        'FilmName VARCHAR(30), FilmStar VARCHAR(40), ReleaseTime VARCHAR(40));')
        cur.execute(create_table)
        for i in range(10):
            url = 'http://maoyan.com/board/4?offset=%d' % (i * 10)
            print("Crawling page %d" % (i + 1))
            content = get_content(url)
            # Capture each film's name, stars, and release time from the list markup.
            pattern = r'<div class="movie-item-info">\s+<p class="name"><a href="/films/\d+" title="\w+" data-act="boarditem-click" data-val="{movieId:\d+}">([^\s+]*)</a></p>\s+<p class="star">\s+主演:([^\s+]*)\s+</p>\s+<p class="releasetime">上映時間:([^\s+]*)</p>\s+</div>'
            Info = re.findall(pattern, content)
            print(Info)
            for film in Info:
                # A parameterized insert avoids broken SQL when a field contains quotes.
                cur.execute('INSERT INTO FilmInfo VALUES (%s, %s, %s);',
                            (film[0], film[1], film[2]))
            FilmInfo.append(Info)
    return FilmInfo


def main():
    parser_all_page()


if __name__ == '__main__':
    main()
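Once the crawl has run, the stored rows can be read back as a quick sanity check. The snippet below is a minimal sketch that assumes the same MySQL credentials and the FilmInfo database/table created by the script above; adjust them to your own setup.

import pymysql

# Connect with the same credentials the crawler used (an assumption from the script above).
conn = pymysql.connect(user='root', password='westos', charset='utf8', db='FilmInfo')
with conn.cursor() as cur:
    # Read back every stored film to confirm the inserts succeeded.
    cur.execute('SELECT FilmName, FilmStar, ReleaseTime FROM FilmInfo;')
    for name, star, release_time in cur.fetchall():
        print(name, star, release_time)
conn.close()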