91、Beego框架之爬蟲專案——2020年08月02日19:57:16
阿新 • • 發佈:2020-08-02
91、Beego框架之爬蟲專案
2020年08月02日15:21:32
1、建立資料庫
-- movie.sql
-- Stores one row per crawled movie page.
-- `_create_time` and `movie_on_time` deliberately have no ON UPDATE clause:
-- with it, every UPDATE of a row would overwrite the creation and release
-- timestamps. Only `_modify_time` tracks the last modification.
CREATE TABLE `movie_info` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `movie_id` int(11) unsigned NOT NULL COMMENT '電影id',
  `movie_name` varchar(100) COMMENT '電影名稱',
  `movie_pic` varchar(200) COMMENT '電影圖片',
  `movie_director` varchar(50) COMMENT '電影導演',
  `movie_writer` varchar(50) COMMENT '電影編劇',
  `movie_country` varchar(50) COMMENT '電影產地',
  `movie_language` varchar(50) COMMENT '電影語言',
  `movie_main_character` varchar(50) COMMENT '電影主演',
  `movie_type` varchar(50) COMMENT '電影型別',
  `movie_on_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '電影上映時間',
  `movie_span` varchar(20) COMMENT '電影時長',
  `movie_grade` varchar(5) COMMENT '電影評分',
  `remark` varchar(500) DEFAULT '' COMMENT '備註',
  `_create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
  `_modify_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改時間',
  `_status` tinyint(1) DEFAULT '1',
  PRIMARY KEY (`id`),
  KEY `idx_movie_id` (`movie_id`),
  KEY `idx_create_time` (`_create_time`),
  KEY `idx_modify_time` (`_modify_time`)
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='電影資訊表';
2、控制層呼叫方法——網頁原始碼寫死
func (c *CrawlMovieController) CrawlMovie() { sMovieHtml := `dasd` //導演 c.Ctx.WriteString(models.GetMovieDirector(sMovieHtml) + "|") //電影名字 c.Ctx.WriteString(models.GetMovieName(sMovieHtml) + "|") //主演 c.Ctx.WriteString(models.GetMovieMainCharacters(sMovieHtml) + "|") c.Ctx.WriteString(models.GetMovieGrade(sMovieHtml) + "|") c.Ctx.WriteString(models.GetMovieGenre(sMovieHtml) + "|") c.Ctx.WriteString(models.GetMovieOnTime(sMovieHtml) + "|") c.Ctx.WriteString(models.GetMovieRunningTime(sMovieHtml) + "|") }
路由層
package routers
import (
"crawl_movie/controllers"
"github.com/astaxie/beego"
)
// init registers the application's routes with beego.
func init() {
	// "/" is registered without an explicit method mapping.
	beego.Router("/", new(controllers.MainController))
	// "/crawl_movie" is registered with the mapping "*:CrawlMovie", which
	// names CrawlMovieController.CrawlMovie as the handler.
	beego.Router("/crawl_movie", new(controllers.CrawlMovieController), "*:CrawlMovie")
}
3、models 獲取資料
定義MovieInfo資料結構
// MovieInfo holds the fields extracted for one movie. It is the model type
// registered with the beego ORM in this package's init.
type MovieInfo struct {
	// Row id and movie id.
	Id, Movie_id int64

	// Title and poster.
	Movie_name, Movie_pic string

	// Credits and origin.
	Movie_director, Movie_writer  string
	Movie_country, Movie_language string
	Movie_main_character          string

	// Genre, release date, running time and rating, all kept as text.
	Movie_type, Movie_on_time string
	Movie_span, Movie_grade   string

	// NOTE(review): the leading underscore makes this field unexported, so it
	// is invisible outside the package — confirm whether it is still needed.
	_Create_time string
}
初始化
import (
_ "github.com/go-sql-driver/mysql"
"github.com/astaxie/beego/orm"
"regexp"
)
// db is the package-level beego ORM handle; it is assigned in init below.
var (
db orm.Ormer
)
// init configures the beego ORM for this package: it enables SQL debug
// output, registers the "default" MySQL database, registers the MovieInfo
// model and stores the resulting Ormer in db.
//
// It panics when the database cannot be registered, so a bad DSN is reported
// with its real cause instead of surfacing later as an unrelated failure.
func init() {
	// In debug mode the ORM prints the SQL statements it runs.
	orm.Debug = true

	// NOTE(review): credentials are hard-coded in the DSN; move them to
	// configuration before using this outside a tutorial setup.
	err := orm.RegisterDataBase("default", "mysql", "root:123@tcp(127.0.0.1:3306)/test?charset=utf8", 30)
	if err != nil {
		panic(err)
	}

	orm.RegisterModel(new(MovieInfo))
	db = orm.NewOrm()
}
獲取導演欄位
// directorRe captures the text of an anchor tagged rel="v:directedBy".
var directorRe = regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)

// GetMovieDirector returns the text of the first director anchor found in
// movieHtml, or "" when the page contains none (including empty input).
func GetMovieDirector(movieHtml string) string {
	// Only the first match is used; a nil result means the pattern is absent.
	m := directorRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieNameRe captures the text of the span tagged property="v:itemreviewed".
var movieNameRe = regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)

// GetMovieName returns the movie title found in movieHtml, or "" when the
// page has no title span (including empty input).
func GetMovieName(movieHtml string) string {
	m := movieNameRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// gradeRe captures the text of the strong element tagged property="v:average".
var gradeRe = regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)

// GetMovieGrade returns the rating text found in movieHtml, or "" when the
// page carries no rating element.
func GetMovieGrade(movieHtml string) string {
	m := gradeRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// releaseDateRe captures the text of a span tagged
// property="v:initialReleaseDate".
var releaseDateRe = regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)

// GetMovieOnTime returns the first release-date text found in movieHtml, or
// "" when the page has no release-date span.
func GetMovieOnTime(movieHtml string) string {
	m := releaseDateRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// runtimeRe captures the text of a span tagged property="v:runtime".
var runtimeRe = regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)

// GetMovieRunningTime returns the running-time text found in movieHtml, or ""
// when the page has no runtime span.
func GetMovieRunningTime(movieHtml string) string {
	m := runtimeRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
獲取到多個主演
// starringRe captures the text of an anchor tagged rel="v:starring".
var starringRe = regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)

// GetMovieMainCharacters returns every starring name found in movieHtml,
// joined with "/" in page order. It returns "" when there is none.
func GetMovieMainCharacters(movieHtml string) string {
	mainCharacters := ""
	for i, m := range starringRe.FindAllStringSubmatch(movieHtml, -1) {
		// The separator goes between names only, so the result never ends
		// with a dangling "/".
		if i > 0 {
			mainCharacters += "/"
		}
		mainCharacters += m[1]
	}
	return mainCharacters
}
// genreRe captures the text of a span tagged property="v:genre".
var genreRe = regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)

// GetMovieGenre returns every genre found in movieHtml, joined with "/" in
// page order. It returns "" when there is none.
func GetMovieGenre(movieHtml string) string {
	movieGenre := ""
	for i, m := range genreRe.FindAllStringSubmatch(movieHtml, -1) {
		// The separator goes between genres only, so the result never ends
		// with a dangling "/".
		if i > 0 {
			movieGenre += "/"
		}
		movieGenre += m[1]
	}
	return movieGenre
}
4、通過url獲取原始碼內容
// Page whose HTML source is fetched.
sUrl := "https://movie.douban.com/subject/25827935/"
//sUrl = "https://movie.douban.com/subject/6786002/?from=subject-page"
// Build the GET request with beego's httplib.
rsp := httplib.Get(sUrl)
// Read the response as a string; err reports a failed request or read.
sMovieHtml,err := rsp.String()
if err != nil{
// This snippet aborts outright when the page cannot be fetched.
panic(err)
}
5、資料存在資料庫裡
// Fill a MovieInfo from the fetched page and store it.
var movieInfo models.MovieInfo
movieInfo.Movie_name = models.GetMovieName(sMovieHtml)
movieInfo.Movie_director = models.GetMovieDirector(sMovieHtml)
movieInfo.Movie_main_character = models.GetMovieMainCharacters(sMovieHtml)
movieInfo.Movie_type = models.GetMovieGenre(sMovieHtml)
movieInfo.Movie_on_time = models.GetMovieOnTime(sMovieHtml)
movieInfo.Movie_grade = models.GetMovieGrade(sMovieHtml)
movieInfo.Movie_span = models.GetMovieRunningTime(sMovieHtml)

// Report a failed insert instead of printing a meaningless id.
id, err := models.AddMovie(&movieInfo)
if err != nil {
	c.Ctx.WriteString("add movie failed: " + err.Error())
} else {
	c.Ctx.WriteString(fmt.Sprintf("%v", id))
}
//movie_info.go
// AddMovie inserts movie_info through the package ORM handle and returns the
// id and error reported by db.Insert unchanged.
func AddMovie(movie_info *MovieInfo) (int64, error) {
	return db.Insert(movie_info)
}
6、找到當前網頁的其他電影連結
// movieURLRe captures the href of anchors that point at movie.douban.com.
// The dots in the host are escaped so that look-alike hosts such as
// "moviexdouban.com" are not accepted.
var movieURLRe = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns, in page order, every https://movie.douban.com/ link
// found in an anchor of movieHtml. Duplicates are kept; the result is nil
// when the page has no such link.
func GetMovieUrls(movieHtml string) []string {
	matches := movieURLRe.FindAllStringSubmatch(movieHtml, -1)
	if len(matches) == 0 {
		return nil
	}
	movieSets := make([]string, 0, len(matches))
	for _, m := range matches {
		movieSets = append(movieSets, m[1])
	}
	return movieSets
}
7、使用redis
✘ ⚙ ~ redis-server
78176:C 02 Aug 2020 18:27:30.697 # oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
78176:C 02 Aug 2020 18:27:30.699 # Redis version=5.0.5, bits=64, commit=00000000, modified=0, pid=78176, just started
78176:C 02 Aug 2020 18:27:30.699 # Warning: no config file specified, using the default config. In order to specify a config file use redis-server /path/to/redis.conf
78176:M 02 Aug 2020 18:27:30.702 * Increased maximum number of open files to 10032 (it was originally set to 256).
_._
_.-``__ ''-._
_.-`` `. `_. ''-._ Redis 5.0.5 (00000000/0) 64 bit
.-`` .-```. ```\/ _.,_ ''-._
( ' , .-` | `, ) Running in standalone mode
|`-._`-...-` __...-.``-._|'` _.-'| Port: 6379
| `-._ `._ / _.-' | PID: 78176
`-._ `-._ `-./ _.-' _.-'
|`-._`-._ `-.__.-' _.-'_.-'|
| `-._`-._ _.-'_.-' | http://redis.io
`-._ `-._`-.__.-'_.-' _.-'
|`-._`-._ `-.__.-' _.-'_.-'|
| `-._`-._ _.-'_.-' |
`-._ `-._`-.__.-'_.-' _.-'
`-._ `-.__.-' _.-'
`-._ _.-'
`-.__.-'
78176:M 02 Aug 2020 18:27:30.724 # Server initialized
78176:M 02 Aug 2020 18:27:30.724 * Ready to accept connections
✘ ~ redis-cli
127.0.0.1:6379> keys *
1) "url_queue"
127.0.0.1:6379> LRANGE url_queue 0 -1
1) "https://movie.douban.com/feed/subject/25827935/reviews"
2) "https://movie.douban.com/subject/25827935/wishes"
3) "https://movie.douban.com/subject/25827935/collections"
4) "https://movie.douban.com/subject/25827935/doulists"
5) "https://movie.douban.com/subject/25827935/questions/727598/?from=subject"
6) "https://movie.douban.com/subject/25827935/questions/727693/?from=subject"
7) "https://movie.douban.com/subject/25827935/discussion/616357468/"
8) "https://movie.douban.com/subject/25827935/discussion/614770818/"
9) "https://movie.douban.com/subject/25827935/discussion/616710371/"
10) "https://movie.douban.com/subject/25827935/discussion/616710732/"
11) "https://movie.douban.com/subject/25827935/discussion/616717788/"
12) "https://movie.douban.com/review/8085061/#comments"
13) "https://movie.douban.com/review/8085061/"
14) "https://movie.douban.com/review/8096152/#comments"
15) "https://movie.douban.com/review/8096152/"
16) "https://movie.douban.com/review/8090401/#comments"
17) "https://movie.douban.com/review/8090401/"
18) "https://movie.douban.com/review/8087492/#comments"
19) "https://movie.douban.com/review/8087492/"
20) "https://movie.douban.com/review/8092536/#comments"
21) "https://movie.douban.com/review/8092536/"
22) "https://movie.douban.com/review/8087571/#comments"
23) "https://movie.douban.com/review/8087571/"
24) "https://movie.douban.com/review/8100255/#comments"
25) "https://movie.douban.com/review/8100255/"
26) "https://movie.douban.com/review/8083968/#comments"
27) "https://movie.douban.com/review/8083968/"
28) "https://movie.douban.com/review/8085143/#comments"
29) "https://movie.douban.com/review/8085143/"
30) "https://movie.douban.com/review/8076701/#comments"
31) "https://movie.douban.com/review/8076701/"
32) "https://movie.douban.com/subject/25827935/comments?status=P"
33) "https://movie.douban.com/subject/27024903/?from=subject-page"
34) "https://movie.douban.com/subject/27024903/?from=subject-page"
35) "https://movie.douban.com/subject/26862829/?from=subject-page"
36) "https://movie.douban.com/subject/26862829/?from=subject-page"
37) "https://movie.douban.com/subject/11529526/?from=subject-page"
38) "https://movie.douban.com/subject/11529526/?from=subject-page"
39) "https://movie.douban.com/subject/6874741/?from=subject-page"
40) "https://movie.douban.com/subject/6874741/?from=subject-page"
41) "https://movie.douban.com/subject/25716096/?from=subject-page"
42) "https://movie.douban.com/subject/25716096/?from=subject-page"
43) "https://movie.douban.com/subject/4739952/?from=subject-page"
44) "https://movie.douban.com/subject/4739952/?from=subject-page"
45) "https://movie.douban.com/subject/4920528/?from=subject-page"
46) "https://movie.douban.com/subject/4920528/?from=subject-page"
47) "https://movie.douban.com/subject/30166972/?from=subject-page"
48) "https://movie.douban.com/subject/30166972/?from=subject-page"
49) "https://movie.douban.com/subject/26366465/?from=subject-page"
50) "https://movie.douban.com/subject/26366465/?from=subject-page"
51) "https://movie.douban.com/subject/3319755/?from=subject-page"
52) "https://movie.douban.com/subject/3319755/?from=subject-page"
53) "https://movie.douban.com/awards/golden-rooster/31/"
54) "https://movie.douban.com/awards/hkfaa/36/"
55) "https://movie.douban.com/awards/goldenhorse/53/"
56) "https://movie.douban.com/subject/25827935/awards/"
57) "https://movie.douban.com/photos/photo/2372679263/"
58) "https://movie.douban.com/photos/photo/2374163695/"
59) "https://movie.douban.com/video/100597/"
60) "https://movie.douban.com/trailer/203039/#content"
61) "https://movie.douban.com/subject/25827935/mupload"
62) "https://movie.douban.com/subject/25827935/all_photos"
63) "https://movie.douban.com/subject/25827935/trailer#short_video"
64) "https://movie.douban.com/subject/25827935/trailer#trailer"
65) "https://movie.douban.com/celebrity/1365506/"
66) "https://movie.douban.com/celebrity/1365506/"
67) "https://movie.douban.com/celebrity/1328349/"
68) "https://movie.douban.com/celebrity/1328349/"
69) "https://movie.douban.com/celebrity/1349387/"
70) "https://movie.douban.com/celebrity/1349387/"
71) "https://movie.douban.com/celebrity/1275243/"
72) "https://movie.douban.com/celebrity/1275243/"
73) "https://movie.douban.com/celebrity/1274224/"
74) "https://movie.douban.com/celebrity/1274224/"
75) "https://movie.douban.com/celebrity/1274534/"
76) "https://movie.douban.com/celebrity/1274534/"
77) "https://movie.douban.com/help/movie#t0-qs"
78) "https://movie.douban.com/subject/25827935/photos?type=R"
79) "https://movie.douban.com/annual/2019?source=movie_navigation"
80) "https://movie.douban.com/annual/2019?source=navigation"
81) "https://movie.douban.com/review/best/"
82) "https://movie.douban.com/tag/"
83) "https://movie.douban.com/chart"
84) "https://movie.douban.com/tv/"
85) "https://movie.douban.com/explore"
86) "https://movie.douban.com/cinema/nowplaying/"
//redis.go
package models
import (
"github.com/astaxie/goredis"
)
// Redis keys used by the crawler.
const (
// URL_QUEUE is the key of the list that holds URLs waiting to be crawled.
URL_QUEUE = "url_queue"
// URL_VISIT_SET is the key of the set that records URLs already visited.
URL_VISIT_SET = "url_visit_set"
)
// client is the package-level goredis client used by every helper below.
var (
client goredis.Client
)
// ConnectRedis points the package client at addr. It only stores the address
// on the client; it performs no call of its own.
func ConnectRedis(addr string) {
	client.Addr = addr
}
// PutinQueue pushes url onto the URL_QUEUE list with LPUSH. Whatever the
// client returns is not inspected.
func PutinQueue(url string) {
	payload := []byte(url)
	client.Lpush(URL_QUEUE, payload)
}
// PopfromQueue removes one entry from URL_QUEUE with RPOP and returns it as a
// string. It panics when the client reports an error.
func PopfromQueue() string {
	raw, popErr := client.Rpop(URL_QUEUE)
	if popErr == nil {
		return string(raw)
	}
	panic(popErr)
}
// GetQueueLength returns the length of URL_QUEUE as reported by LLEN, or 0
// when the client reports an error.
func GetQueueLength() int {
	if n, err := client.Llen(URL_QUEUE); err == nil {
		return n
	}
	return 0
}
// AddToSet records url in URL_VISIT_SET with SADD. Whatever the client
// returns is not inspected.
func AddToSet(url string) {
	member := []byte(url)
	client.Sadd(URL_VISIT_SET, member)
}
// IsVisit reports whether url is a member of URL_VISIT_SET. A client error is
// treated as "not visited".
func IsVisit(url string) bool {
	member, err := client.Sismember(URL_VISIT_SET, []byte(url))
	return err == nil && member
}
8、遞迴查詢連結
//movie_info.go
package models
import (
_ "github.com/go-sql-driver/mysql"
"github.com/astaxie/beego/orm"
"regexp"
"strings"
)
// db is the package-level beego ORM handle; it is assigned in init.
var (
db orm.Ormer
)
// MovieInfo holds the fields extracted for one movie. It is the model type
// registered with the beego ORM in this package's init.
type MovieInfo struct {
	// Row id and movie id.
	Id, Movie_id int64

	// Title and poster.
	Movie_name, Movie_pic string

	// Credits and origin.
	Movie_director, Movie_writer  string
	Movie_country, Movie_language string
	Movie_main_character          string

	// Genre, release date, running time and rating, all kept as text.
	Movie_type, Movie_on_time string
	Movie_span, Movie_grade   string
}
// init configures the beego ORM for this package: it enables SQL debug
// output, registers the "default" MySQL database, registers the MovieInfo
// model and stores the resulting Ormer in db.
//
// It panics when the database cannot be registered, so a bad DSN is reported
// with its real cause instead of surfacing later as an unrelated failure.
func init() {
	// In debug mode the ORM prints the SQL statements it runs.
	orm.Debug = true

	// NOTE(review): credentials are hard-coded in the DSN; move them to
	// configuration before using this outside a tutorial setup.
	err := orm.RegisterDataBase("default", "mysql", "root:123@tcp(127.0.0.1:3306)/test?charset=utf8", 30)
	if err != nil {
		panic(err)
	}

	orm.RegisterModel(new(MovieInfo))
	db = orm.NewOrm()
}
// AddMovie inserts movie_info through the package ORM handle and returns the
// id and error reported by db.Insert unchanged.
func AddMovie(movie_info *MovieInfo) (int64, error) {
	// Id is cleared before the insert, so a struct reused by the caller does
	// not carry an earlier id into the insert.
	movie_info.Id = 0
	return db.Insert(movie_info)
}
// GetMovieDirector returns the text of the first anchor tagged
// rel="v:directedBy" in movieHtml, or "" when the input is empty or has no
// such anchor.
func GetMovieDirector(movieHtml string) string {
	if len(movieHtml) == 0 {
		return ""
	}
	pattern := regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)
	if first := pattern.FindStringSubmatch(movieHtml); first != nil {
		return first[1]
	}
	return ""
}
// GetMovieName returns the text of the first span tagged
// property="v:itemreviewed" in movieHtml, or "" when the input is empty or
// has no such span.
func GetMovieName(movieHtml string) string {
	if len(movieHtml) == 0 {
		return ""
	}
	pattern := regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)
	if first := pattern.FindStringSubmatch(movieHtml); first != nil {
		return first[1]
	}
	return ""
}
// GetMovieMainCharacters collects the text of every anchor tagged
// rel="v:starring" in movieHtml and returns the names joined with "/", with
// any leading or trailing "/" trimmed. It returns "" when there is no match.
func GetMovieMainCharacters(movieHtml string) string {
	starring := regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)
	matches := starring.FindAllStringSubmatch(movieHtml, -1)

	names := make([]string, 0, len(matches))
	for i := range matches {
		names = append(names, matches[i][1])
	}
	return strings.Trim(strings.Join(names, "/"), "/")
}
// GetMovieGrade returns the text of the first strong element tagged
// property="v:average" in movieHtml, or "" when there is none.
func GetMovieGrade(movieHtml string) string {
	pattern := regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)
	if first := pattern.FindStringSubmatch(movieHtml); first != nil {
		return first[1]
	}
	return ""
}
// GetMovieGenre collects the text of every span tagged property="v:genre" in
// movieHtml and returns the genres joined with "/", with any leading or
// trailing "/" trimmed. It returns "" when there is no match.
func GetMovieGenre(movieHtml string) string {
	genrePattern := regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)
	matches := genrePattern.FindAllStringSubmatch(movieHtml, -1)

	genres := make([]string, 0, len(matches))
	for i := range matches {
		genres = append(genres, matches[i][1])
	}
	return strings.Trim(strings.Join(genres, "/"), "/")
}
// GetMovieOnTime returns the text of the first span tagged
// property="v:initialReleaseDate" in movieHtml, or "" when there is none.
func GetMovieOnTime(movieHtml string) string {
	pattern := regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)
	if first := pattern.FindStringSubmatch(movieHtml); first != nil {
		return first[1]
	}
	return ""
}
// GetMovieRunningTime returns the text of the first span tagged
// property="v:runtime" in movieHtml, or "" when there is none.
func GetMovieRunningTime(movieHtml string) string {
	pattern := regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)
	if first := pattern.FindStringSubmatch(movieHtml); first != nil {
		return first[1]
	}
	return ""
}
// movieURLRe captures the href of anchors that point at movie.douban.com.
// The dots in the host are escaped so that look-alike hosts such as
// "moviexdouban.com" are not accepted.
var movieURLRe = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns, in page order, every https://movie.douban.com/ link
// found in an anchor of movieHtml. Duplicates are kept; the result is nil
// when the page has no such link.
func GetMovieUrls(movieHtml string) []string {
	matches := movieURLRe.FindAllStringSubmatch(movieHtml, -1)
	if len(matches) == 0 {
		return nil
	}
	movieSets := make([]string, 0, len(matches))
	for _, m := range matches {
		movieSets = append(movieSets, m[1])
	}
	return movieSets
}
//crawlMovie.go
package controllers
import (
"crawl_movie/models"
"github.com/astaxie/beego"
"github.com/astaxie/beego/httplib"
"time"
)
// CrawlMovieController is the controller whose CrawlMovie method runs the
// crawl. It embeds beego.Controller.
type CrawlMovieController struct {
beego.Controller
}
// CrawlMovie crawls movie pages starting from a fixed entry URL, using Redis
// as the work queue and as the visited set.
//
// For each queued URL that has not been visited it fetches the page, stores
// the movie details when the page carries a movie title, and queues every
// movie link on the page that has not been visited yet. Each queued link is
// echoed to the response. The loop ends when the queue is empty.
//
// Only the fetched HTML is parsed, so content that a page loads dynamically
// is not seen; a headless browser such as phantomjs would be needed for that.
//
// A page that cannot be fetched, or a movie that cannot be stored, is
// reported in the response and skipped instead of aborting the whole crawl.
func (c *CrawlMovieController) CrawlMovie() {
	// Connect to Redis.
	models.ConnectRedis("127.0.0.1:6379")

	// Entry URL of the crawl.
	sUrl := "https://movie.douban.com/subject/25827935/"
	models.PutinQueue(sUrl)

	// An empty queue ends the crawl.
	for models.GetQueueLength() > 0 {
		sUrl = models.PopfromQueue()
		// Skip URLs that were already handled.
		if models.IsVisit(sUrl) {
			continue
		}
		// Mark the URL before fetching so that a failing page is not retried
		// forever and a page linking to itself is not queued again.
		models.AddToSet(sUrl)

		rsp := httplib.Get(sUrl)
		// User-Agent and Cookie are set to avoid a 403 from douban.
		rsp.Header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
		rsp.Header("Cookie", `bid=gFP9qSgGTfA; __utma=30149280.1124851270.1482153600.1483055851.1483064193.8; __utmz=30149280.1482971588.4.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118221"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1483064193%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=5afcf5e5496eab22.1482413017.7.1483066280.1483057909.; __utma=223695111.1636117731.1482413017.1483055857.1483064193.7; __utmz=223695111.1483055857.6.5.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=BDC2DBEDF8958EC838F9D9394CC5D9A0|2cc6ef7952be8c2d5408cb7c8cce2684; ap=1; viewed="1006073"; gr_user_id=e5c932fc-2af6-4861-8a4f-5d696f34570b; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1483064193; __utmb=223695111.0.10.1483064193`)
		sMovieHtml, err := rsp.String()
		if err != nil {
			// One unreachable page must not end the crawl.
			c.Ctx.WriteString("<br>fetch failed: " + sUrl + ": " + err.Error() + "</br>")
			time.Sleep(time.Second)
			continue
		}

		// A fresh value per page keeps fields of an earlier movie from
		// leaking into this one.
		var movieInfo models.MovieInfo
		movieInfo.Movie_name = models.GetMovieName(sMovieHtml)
		// Only pages that carry a movie title are stored.
		if movieInfo.Movie_name != "" {
			movieInfo.Movie_director = models.GetMovieDirector(sMovieHtml)
			movieInfo.Movie_main_character = models.GetMovieMainCharacters(sMovieHtml)
			movieInfo.Movie_type = models.GetMovieGenre(sMovieHtml)
			movieInfo.Movie_on_time = models.GetMovieOnTime(sMovieHtml)
			movieInfo.Movie_grade = models.GetMovieGrade(sMovieHtml)
			movieInfo.Movie_span = models.GetMovieRunningTime(sMovieHtml)
			if _, err := models.AddMovie(&movieInfo); err != nil {
				c.Ctx.WriteString("<br>store failed: " + sUrl + ": " + err.Error() + "</br>")
			}
		}

		// Queue every link on the page that has not been visited yet.
		for _, url := range models.GetMovieUrls(sMovieHtml) {
			if models.IsVisit(url) {
				continue
			}
			models.PutinQueue(url)
			c.Ctx.WriteString("<br>" + url + "</br>")
		}

		// Pause between requests.
		time.Sleep(time.Second)
	}
	c.Ctx.WriteString("end of crawl!")
}
END
2020年08月02日19:31:37
2020年08月02日19:56:44