python 爬蟲應用——校園網搜尋引擎（crawler application——Campus web search engine）

阿新 • • 發佈：2020-07-16

看了《Python專案案例開發從入門到實戰》中爬蟲應用——校園網搜尋引擎，這一章節涉及到的內容有：

資料庫的基本使用
正則表示式
中文分詞

我詳細註釋了其中關於校園網搜尋引擎的程式碼，分享給大家：

import re
import sqlite3
import sys
import urllib
from collections import deque
from contextlib import closing
from urllib import request

import jieba
import lxml
from bs4 import BeautifulSoup
 10 
 11 # 要先定義爬蟲抓取的第一個網址，這裡是是華僑大學的主頁
 12 url = 'https://www.hqu.edu.cn/index.htm'
 13 
 14 # 待爬取連結的集合，使用廣度優先搜尋
 15 unvisited = deque()
 16 
 17 # 已訪問的連結集合
 18 visited = set()
 19 
 20 unvisited.append(url)
 21 
 22 # 建立資料庫連線，沒有則建立資料庫
 23 conn = sqlite3.connect('viewsdu.db')
 24 # 建立遊標物件
 25 c = conn.cursor()
 
 26 # 在 create table 之前先 drop table 是因為如果你的資料庫中已經存在了名叫doc的瀏覽器了，那麼就要再次執行把存在的的doc瀏覽器刪除重建
 27 # ！！！！如果之前不存在名叫doc的瀏覽器，那麼這一句話要註釋掉
 28 c.execute('drop table doc')
 29 # 建立名叫doc的資料庫，包含兩個變數，一個是int型的id，一個是text型的link
 30 c.execute('create table doc(id int primary key, link text)')
 31 # ！！！！如果之前不存在名叫word的瀏覽器，那麼這一句話要註釋掉 

 32 c.execute('drop table word')
 33 # 建立名叫word的資料庫，包含連個變數，一個是varchar(25)型的term，一個是text型別的list
 34 c.execute('create table word(term varchar(25) primary key, list text)')
 35 # 提交資料庫
 36 conn.commit()
 37 # 關閉資料庫
 38 conn.close()
 39 
 40 print('************************** 開始爬取 ****************************')
 41 
 42 cnt = 0
 43 print('開始..........')
 44 # 當還存在帶爬取的網頁的時候就一直執行迴圈
 45 while unvisited:
 46     # 丟擲第一個數
 47     url = unvisited.popleft()
 48     # 已經訪問的連結集合添加當前訪問連結
 49     visited.add(url)
 50     cnt += 1
 51     print('開始抓取第', cnt, '個連結： ', url)
 52 
 53     # 爬取網頁內容
 54     try:
 55         # 開啟網頁
 56         response = request.urlopen(url)
 57         # 讀取網頁內容並使用 utf-8 進行解碼
 58         content = response.read().decode('utf-8')
 59     except:
 60         continue
 61 
 62     # 尋找下一個可爬的連結，因為搜尋範圍是網站內，所以對連結有格式要求，需根據具體情況而定
 63     # 解析網頁內容，可能有集中情況，這也是根據這個網站網頁的具體情況寫的
 64     soup = BeautifulSoup(content, 'lxml')
 65     # print(soup.prettify())
 66     # 找到所有的 target='_blank' 的連結，注意這個要根據爬取的內容進行修改
 67     all_a = soup.find_all('a', {'target': '_blank'})
 68     for a in all_a:
 69         # print(a.attrs['href'])
 70         # 得到href的值
 71         x = a.attrs['href']
 72         # 排除開頭是http，但不是https://www.hqu.edu.cn
 73         if re.match(r'http.+', x):
 74             if not re.match(r'http\:\/\/www\.hqu\.edu\.cn\/.+', x):
 75                 continue
 76         # "/info/1046/20314.htm"
 77         if re.match(r'\/info\/.+', x):
 78             x = 'http://www.hqu.edu.cn' + x
 79         # "info/1046/20314.htm"
 80         elif re.match(r'info/.+', x):
 81             x = 'http://www.hqu.edu.cn/' + x
 82         # "../info/1046/20314.htm"
 83         elif re.match(r'\.\.\/info/.+', x):
 84             x = 'http://www.hqu.edu.cn' + x[2:]
 85         # "../../info/1046/20314.htm"
 86         elif re.match(r'\.\.\/\.\.\/info/.+', x):
 87             x = 'http://www.hqu.edu.cn' + x[5:]
 88         if (x not in visited) and (x not in unvisited):
 89             print(x)
 90             unvisited.append(x)
 91     # 下一頁<a>
 92     a = soup.find('a', {'class': 'Next'})
 93     if a is not None:
 94         x = a.attrs['href']
 95         if re.match(r'xwdt\/.+', x):
 96             x = 'http://www.hqu.edu.cn/index/' + x
 97         else:
 98             x = 'http://www.hqu.edu.cn/index/xwdt' + x
 99         if (x not in visited) and (x not in unvisited):
100             unvisited.append(x)
101 
102     # ************************* 解析網頁內容 ************************* #
103     # 得到網頁標題
104     title = soup.title
105     # 得到網頁內容，注意這個要根據爬取的內容進行修改
106     article = soup.find('div', class_='v_news_content')
107     # 或者使用這個也可以
108     # article = soup.find('div', id='vsb_content')
109     # 作者，注意這個要根據爬取的內容進行修改
110     author = soup.find('span', class_='arti_publisher')
111     # 釋出時間，注意這個要根據爬取的內容進行修改
112     time = soup.find('span', class_='arti_update')
113     # print('title : \n', title)
114     # print('article : \n', article)
115     # print('author : \n', author)
116     # print('time : \n', time)
117 
118     if title is None and article is None and author is None:
119         print('無內容的頁面。')
120         continue
121 
122     elif article is None and author is None:
123         print('只有標題。')
124         title = title.text
125         title = ''.join(title.split())
126         article = ''
127         author = ''
128 
129     elif article is None:
130         print('有標題有作者，缺失內容')
131         title = title.text
132         title = ''.join(title.split())
133         article = ''
134         author = author.get_text("", strip=True)
135         author = ''.join(author.split())
136 
137     elif author is None:
138         print('有標題有內容，缺失作者')
139         title = title.text
140         title = ''.join(title.split())
141         article = article.get_text("", strip=True)
142         article = ''.join(article.split())
143         author = ''
144     else:
145         # 得到標籤中文字內容
146         title = title.text
147         # 去除空格
148         title = ''.join(title.split())
149         # 得到變遷中文字內容，strip=True表示去除空白行
150         article = article.get_text("", strip=True)
151         article = ''.join(article.split())
152         author = author.get_text("", strip=True)
153         author = ''.join(author.split())
154         
155     print('網頁標題：', title)
156 
157     # 對title，article，author內容的詞進行結巴分詞，使用的是搜尋引擎模式的cut_for_search
158     seglist = []
159     for i in [title, article, author]:
160         seggen = jieba.cut_for_search(i)
161         seglist.extend(seggen)
162 
163     # 資料傳輸
164     conn = sqlite3.connect("viewsdu.db")
165     c = conn.cursor()
166     # 在名為doc的資料庫中插入行的一行
167     c.execute('insert into doc values (?,?)', (cnt, url))
168     # 對每個分出的詞語建立倒排詞表
169     for word in seglist:
170         # 檢驗看看這個詞語是否已存在於資料庫
171         c.execute('select list from word where term=?', (word,))
172         result = c.fetchall()
173         # 如果不存在
174         if len(result) == 0:
175             docliststr = str(cnt)
176             # 在word資料庫中插入新的行值
177             c.execute('insert into word values (?, ?)', (word, docliststr))
178         # 如果已存在
179         else:
180             # 提取當前結果返回的內容，返回的就是如 ‘19 19’
181             docliststr = result[0][0]
182             # 加上新得到的內容，返回的內容如 ‘19 19 20’
183             docliststr += ' ' + str(cnt)
184             # 更新word資料庫中term對應的值
185             c.execute('update word set list=? where term=?', (docliststr, word))
186     conn.commit()
187     conn.close()
188 print('詞表建立完畢！！！')

這裡需要注意：程式碼中用來定位網頁元素的那些條件，比如：{'target': '_blank'}，class_='v_news_content'，class_='arti_publisher'，class_='arti_update'，都是根據被爬取網頁的具體結構決定的，要學會靈活變通。這些名稱的來源如下：

class_='arti_update' 來源：

class_='v_news_content' 來源：

class_='arti_publisher' 來源：

類似這樣得到這些值。

最終的結果是得到一個詞表資料庫，如下圖所示：

viewsdu.db 資料庫：

doc 資料表：

word 資料表：