初試python爬蟲之:豆瓣電影爬蟲
因為課程需要,前兩天花了一天學習python並寫了一個豆瓣電影的爬蟲。
課程要求是這樣的:
爬取豆瓣網站上排名前50名的電影,包括電影名字,電影評分,電影簡介;
將爬下來的電影資料進行分類,按照不同分類儲存在資料庫/Excel中的不同表中。
python的環境安裝配置,以及語法解釋本次就不提及了。
由於本人的正則不夠熟練,之前用正則寫時並沒有成功爬下來;幾次嘗試失敗之後,本人改用BeautifulSoup來爬。所以希望看到正則寫法的朋友,只能說聲抱歉了。
直接上程式碼。
sql:
# Host: 127.0.0.1 (Version: 5.7.17-log)
# Date: 2018-06-04 11:52:30
# Generator: MySQL-Front 5.3 (Build 4.269)
#
# Rebuilds the `pythontest` schema from scratch: an existing database of that
# name is dropped together with its data, then recreated with utf8 as the
# default character set and selected for the statements that follow.

/*!40101 SET NAMES utf8 */;

DROP DATABASE IF EXISTS pythontest;
CREATE DATABASE pythontest DEFAULT CHARACTER SET utf8;
USE pythontest;
#
# Table `愛情表`: one row per movie filed under the 愛情 (romance) genre.
#
CREATE TABLE `愛情表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `傳記表`: one row per movie filed under the 傳記 (biography) genre.
#
CREATE TABLE `傳記表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `動畫表`: one row per movie filed under the 動畫 (animation) genre.
#
CREATE TABLE `動畫表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `動作表`: one row per movie filed under the 動作 (action) genre.
#
CREATE TABLE `動作表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `兒童表`: one row per movie filed under the 兒童 (children) genre.
#
CREATE TABLE `兒童表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `犯罪表`: one row per movie filed under the 犯罪 (crime) genre.
#
CREATE TABLE `犯罪表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `歌舞表`: one row per movie filed under the 歌舞 (song-and-dance
# musical) genre.
#
CREATE TABLE `歌舞表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `紀錄片表`: one row per movie filed under the 紀錄片 (documentary)
# genre.
#
CREATE TABLE `紀錄片表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `家庭表`: one row per movie filed under the 家庭 (family) genre.
#
CREATE TABLE `家庭表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `驚悚表`: one row per movie filed under the 驚悚 (thriller) genre.
#
CREATE TABLE `驚悚表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `劇情表`: one row per movie filed under the 劇情 (drama) genre.
#
CREATE TABLE `劇情表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `科幻表`: one row per movie filed under the 科幻 (science fiction)
# genre.
#
CREATE TABLE `科幻表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `歷史表`: one row per movie filed under the 歷史 (history) genre.
#
CREATE TABLE `歷史表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `冒險表`: one row per movie filed under the 冒險 (adventure) genre.
#
CREATE TABLE `冒險表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `奇幻表`: one row per movie filed under the 奇幻 (fantasy) genre.
#
CREATE TABLE `奇幻表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `同性表`: one row per movie filed under the 同性 (same-sex / LGBT)
# genre.
#
CREATE TABLE `同性表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `喜劇表`: one row per movie filed under the 喜劇 (comedy) genre.
#
CREATE TABLE `喜劇表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `懸疑表`: one row per movie filed under the 懸疑 (mystery) genre.
#
CREATE TABLE `懸疑表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `音樂表`: one row per movie filed under the 音樂 (music) genre.
#
CREATE TABLE `音樂表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `災難表`: one row per movie filed under the 災難 (disaster) genre.
#
CREATE TABLE `災難表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
#
# Table `戰爭表`: one row per movie filed under the 戰爭 (war) genre.
#
CREATE TABLE `戰爭表` (
  `id`     INT(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `name`   VARCHAR(30)  DEFAULT NULL COMMENT '電影名',
  `score`  FLOAT        DEFAULT NULL COMMENT '評分',
  `review` VARCHAR(100) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
本人貪方便,並且本次核心不在資料庫,所以資料庫的表名是用中文建立的。如果各位有要求的話可以自行修改。
python:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape the first pages of Douban's Top 250 list into per-genre MySQL tables.

For every movie on the crawled pages the title, rating and one-line quote are
collected, and one row is inserted into the ``<genre>表`` table of the
``pythontest`` database for each genre the movie is listed under.

Written for Python 3: text is handled as ``str`` throughout, so the Python 2
``reload(sys)`` / ``sys.setdefaultencoding`` workaround is not needed.
"""
import re
import sys

import pymysql
import requests
from bs4 import BeautifulSoup

downUrl = 'http://movie.douban.com/top250/'

# The crawl loop fetches at most this many list pages.
MAX_PAGES = 2
# Placeholder stored when a movie has no one-line quote.
NO_REVIEW = '無'
# A genre becomes part of a table name, and identifiers cannot be sent as
# bound parameters. Only plain word characters are accepted so that scraped
# text can never inject SQL through the table name.
_GENRE_RE = re.compile(r'^\w+$')


def _parse_genres(info_text):
    """Return the genre names that follow the last '/' of a movie info text.

    The genres are split on whitespace rather than cut into fixed two
    character chunks, so names of any length (e.g. 紀錄片) stay intact.
    An info text without any '/' yields an empty list.
    """
    _, separator, tail = info_text.rpartition('/')
    if not separator:
        return []
    return tail.split()


def _insert_movie(db, cursor, genre, title, rating, quote):
    """Insert one movie into the table of *genre*; report and skip failures."""
    if not _GENRE_RE.match(genre):
        print('skipping unexpected genre: %r' % genre, file=sys.stderr)
        return
    # Only the validated identifier is interpolated. The values are bound by
    # the driver, so quotes inside a title or quote cannot break the query.
    sql = ('INSERT INTO `{}表` (name, score, review) '
           'VALUES (%s, %s, %s)').format(genre)
    print(sql)
    try:
        cursor.execute(sql, (title, rating, quote))
        db.commit()
    except pymysql.MySQLError as exc:
        # Best effort: a failing insert (e.g. no table for this genre) must
        # not stop the remaining inserts, but it should be visible.
        db.rollback()
        print('insert into %s表 failed: %s' % (genre, exc), file=sys.stderr)


def grapInFo(pra):
    """Extract the movie data and the next-page URL from one list page.

    Args:
        pra: HTML of a Top 250 list page, as ``str`` or ``bytes``.

    Returns:
        A 5-tuple ``(name, score, reviewList, categoryList, next_url)``.
        The four lists are index-aligned with one entry per movie:
        title, rating text, one-line quote (``'無'`` when absent) and the
        movie's genres joined by single spaces. ``next_url`` is the absolute
        URL of the following page, or ``None`` when there is no next link.
        A page without the movie list yields four empty lists.
    """
    soup = BeautifulSoup(pra, 'html.parser')
    name = []          # titles
    score = []         # ratings, still as text
    reviewList = []    # one-line quotes
    categoryList = []  # space-separated genres

    ol = soup.find('ol', class_='grid_view')
    for item in (ol.find_all('li') if ol else []):
        detail = item.find('div', attrs={'class': 'hd'})
        movieName = detail.find('span', attrs={'class': 'title'}).get_text()
        scoreText = item.find('span', attrs={'class': 'rating_num'}).get_text()
        review = item.find('span', attrs={'class': 'inq'})
        info = item.find('div', attrs={'class': 'bd'}).find('p').get_text()

        name.append(movieName)
        score.append(scoreText)
        # Not every movie has a quote.
        reviewList.append(review.get_text() if review else NO_REVIEW)
        categoryList.append(' '.join(_parse_genres(info)))

    # The "next" span may be missing entirely, or present without a link.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span else None
    next_url = downUrl + page['href'] if page else None
    return name, score, reviewList, categoryList, next_url


def main():
    """Crawl up to ``MAX_PAGES`` pages and store each movie under its genres."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/47.0.2526.80 Safari/537.36'
    }
    name = []
    score = []
    review = []
    category = []

    url = downUrl
    for _ in range(MAX_PAGES):
        if not url:  # ran out of pages early
            break
        # A timeout keeps a stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        movie, scoreNum, reviewList, categoryList, url = grapInFo(
            response.content)
        if not movie:
            print('no movies found on page', file=sys.stderr)
        name.extend(movie)
        score.extend(scoreNum)
        review.extend(reviewList)
        category.extend(categoryList)

    # Keyword arguments: current PyMySQL no longer accepts positional ones.
    db = pymysql.connect(host='127.0.0.1', user='root', password='',
                         database='pythontest', charset='utf8')
    try:
        cursor = db.cursor()
        for title, rating, quote, genres in zip(name, score, review, category):
            print(title, ' ', rating, ' ', quote, ' ', genres)
            for genre in genres.split():
                _insert_movie(db, cursor, genre, title, float(rating), quote)
    finally:
        # Release the connection even when an unexpected error escapes.
        db.close()


if __name__ == '__main__':
    main()
python(第二種寫法):
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Crawl Douban's Top 250 list page by page, inserting movies as they are read.

Each movie's title, rating and one-line summary are written to the
``<genre>表`` table of the ``pythontest`` database, once per genre the movie
is listed under.

Written for Python 3: text is handled as ``str`` throughout, so the Python 2
``reload(sys)`` / ``sys.setdefaultencoding`` workaround is not needed.
"""
import re
import sys

import pymysql
import requests
from bs4 import BeautifulSoup

url = "https://movie.douban.com/top250"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; '
                  'rv:11.0) like Gecko'
}

# praseHtml stops after this many list pages.
MAX_PAGES = 2
# Placeholder stored when a movie has no one-line summary.
NO_SUMMARY = '無'
# A genre becomes part of a table name, and identifiers cannot be sent as
# bound parameters. Only plain word characters are accepted so that scraped
# text can never inject SQL through the table name.
_GENRE_RE = re.compile(r'^\w+$')


def _parse_genres(info_text):
    """Return the genre names that follow the last '/' of a movie info text.

    The genres are split on whitespace rather than cut into fixed two
    character chunks, so names of any length (e.g. 紀錄片) stay intact.
    An info text without any '/' yields an empty list.
    """
    _, separator, tail = info_text.rpartition('/')
    if not separator:
        return []
    return tail.split()


def _insert_movie(db, cursor, genre, title, rating, summary):
    """Insert one movie into the table of *genre*; report and skip failures."""
    if not _GENRE_RE.match(genre):
        print('skipping unexpected genre: %r' % genre, file=sys.stderr)
        return
    # Only the validated identifier is interpolated. The values are bound by
    # the driver, so quotes inside a title or summary cannot break the query.
    sql = ('INSERT INTO `{}表` (name, score, review) '
           'VALUES (%s, %s, %s)').format(genre)
    print(sql)
    try:
        cursor.execute(sql, (title, rating, summary))
        db.commit()
    except pymysql.MySQLError as exc:
        # Best effort: a failing insert (e.g. no table for this genre) must
        # not stop the remaining inserts, but it should be visible.
        db.rollback()
        print('insert into %s表 failed: %s' % (genre, exc), file=sys.stderr)


def praseHtml(url, headers):
    """Fetch list pages starting at *url* and store every movie per genre.

    Args:
        url: Address of the first list page; relative "next page" links are
            appended to it.
        headers: HTTP headers sent with every request.

    At most ``MAX_PAGES`` pages are processed. Crawling also stops when a
    page carries no next-page link.
    """
    currenturl = url
    i = 1     # running movie number, for the progress output
    page = 1  # number of the page being processed

    # Keyword arguments: current PyMySQL no longer accepts positional ones.
    db = pymysql.connect(host='127.0.0.1', user='root', password='',
                         database='pythontest', charset='utf8')
    try:
        cursor = db.cursor()
        while currenturl:
            # A timeout keeps a stalled connection from hanging the crawl.
            response = requests.get(currenturl, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            moveList = soup.find('ol', class_='grid_view')
            if moveList is None:
                print('no movie list found on page', file=sys.stderr)
            for moveLi in (moveList.find_all('li') if moveList else []):
                movieName = moveLi.find(
                    'span', attrs={'class': 'title'}).get_text()
                star = moveLi.find(
                    'span', attrs={'class': 'rating_num'}).get_text()
                # Not every movie has a summary; calling get_text() on the
                # missing tag would raise AttributeError.
                inq = moveLi.find('span', attrs={'class': 'inq'})
                summary = inq.get_text() if inq else NO_SUMMARY
                info = moveLi.find(
                    'div', attrs={'class': 'bd'}).find('p').get_text()
                genres = _parse_genres(info)

                print(str(i) + ": " + movieName + " " + star + " "
                      + summary + " " + ' '.join(genres))
                for genre in genres:
                    _insert_movie(db, cursor, genre, movieName,
                                  float(star), summary)
                i += 1

            # The "next" span may be missing, or present without a link.
            next_span = soup.find('span', attrs={'class': 'next'})
            nextpage = next_span.find('a') if next_span else None
            currenturl = url + nextpage['href'] if nextpage else None

            page += 1
            if page > MAX_PAGES:
                break
    finally:
        # Release the connection even when an unexpected error escapes.
        db.close()


praseHtml(url, headers)
本人寫了兩種。原理是一樣的,大家可以自行檢視。
本次爬蟲屬於入門級別,比較簡單,所以也不做過多解釋了。
給大家提一下,本次的難點在於豆瓣電影的分類。由於在相應的li中沒有單獨的分類標籤,而是將分類資訊與其他資訊混合在一起,
所以對這些字串的處理比較複雜。因為本人習慣用java以及眾多類庫,初學python不太適應,所以本次的處理可能有更好的方式。如果各位有更簡潔的方式,不妨留下讓本人學習。
然後是執行結果:
爬蟲挺有趣也挺實用的。之後空閒還會接著學習以及嘗試各種爬蟲。