Scraping All Articles from a School's Official Website
阿新 • Published: 2020-07-05
Step 1: import the packages
import requests
from lxml import etree
from threading import Thread  # imported for a threaded version; unused in the single-threaded run below
from queue import Queue       # likewise unused below
import time
import redis
import re
from bs4 import BeautifulSoup
import tqdm
import os
Set up the database connection
# db=num and password="psd" are placeholders: fill in your own DB index and password
pool = redis.ConnectionPool(host='localhost', port=6379, db=num, password="psd")
redis = redis.StrictRedis(connection_pool=pool)  # note: this shadows the redis module
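Before crawling, it is worth checking that the connection actually works. A minimal sketch, assuming a local Redis instance on the default port (db=0 is my assumption here); ping() is redis-py's standard liveness check:

import redis

pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
r = redis.StrictRedis(connection_pool=pool)
print(r.ping())  # True if the server is reachable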
The init function
def __init__(self):
    self.index_urls = set()      # category links found on the home page
    self.two_index_urls = set()  # category links found one level down
    self.url_title = {}
    self.base_url = url          # url: the site root, e.g. 'http://www.lzlqc.com/'
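The post lists the methods without their enclosing class. A sketch of the assumed skeleton (the class name SchoolSpider is mine, and I pass the site root in explicitly rather than rely on a global url):

class SchoolSpider:
    def __init__(self, url):  # url: site root such as 'http://www.lzlqc.com/'
        self.index_urls = set()
        self.two_index_urls = set()
        self.url_title = {}
        self.base_url = url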
Collect links from the home page
def get_head_index_url(self):
    url = self.base_url
    response = requests.get(url)
    html = response.text
    page = etree.HTML(html)
    # category pages all carry "Category_" in their href
    contents = page.xpath('//a[contains(@href,"Category_")]/@href')
    for i in contents:
        self.index_urls.add(i)
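To see what that XPath matches, here is a tiny runnable example on a hypothetical anchor tag (the markup is made up; the real site's pages will differ):

from lxml import etree

page = etree.HTML('<a href="/xwzx/Category_12/Index.aspx">news</a>')  # hypothetical markup
print(page.xpath('//a[contains(@href,"Category_")]/@href'))
# ['/xwzx/Category_12/Index.aspx']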
Collect the links one level below the home-page links
def get_two_index_url(self):
    for i in self.index_urls:
        url = self.base_url + i
        response = requests.get(url)
        html = response.text
        page = etree.HTML(html)
        contents = page.xpath('//a[contains(@href,"Category_")]/@href')
        for href in contents:  # renamed from i to avoid shadowing the outer loop variable
            self.two_index_urls.add(href)
    self.index_urls |= self.two_index_urls  # merge the second level into the full set
Identify the usable links among those collected
def get_all_url_title_redis(self):
    for i in self.index_urls:
        url = self.base_url[:-1] + i  # hrefs start with '/', so drop the trailing slash
        try:
            response = requests.get(url, timeout=5)
            time.sleep(0.5)  # throttle requests
            html = response.text
            page = etree.HTML(html)
            title = page.xpath('//em/a/text()')  # breadcrumb title of the page
            redis.hset("url_title", url, str(title))
            print(url, 'over ')
        except Exception as e:
            print(e)
            print("{} get wrong!".format(url))
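Note that page.xpath(...) returns a list, and str of an empty list is the two-character string '[]'. That is exactly what the length-2 check in the next step relies on:

title = []               # what the xpath returns when the page has no breadcrumb
print(str(title))        # '[]'
print(len(str(title)))   # 2 -> such entries are skipped as unusable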
Extract the usable links
def get_all_url_from_redis_set(self):
    urls = redis.hkeys("url_title")
    for i in urls:
        # an empty title list was stored as the 2-byte string "[]"; skip those
        if len(redis.hget("url_title", i)) != 2:
            redis.hset("can_use_urls", i.decode("utf8"), redis.hget("url_title", i))
            print("set {} ok!".format(i.decode("utf8")))

def get_all_split_url_to_redis(self):
    all_page_num = 0
    for i in redis.hkeys("can_use_urls"):
        all_page_num += 1
        head_url = i.decode('utf8')
        print(head_url)
        # list pages are paginated as Index.aspx, Index_2.aspx, Index_3.aspx, ...
        base_url = head_url[:len(head_url) - len('Index.aspx')]
        modol_url = base_url + "Index_{}" + ".aspx"
        response = requests.get(head_url, timeout=5)
        time.sleep(0.5)
        html = response.text
        page = etree.HTML(html)
        url_details = page.xpath('//span[@class="disabled"]/text()')
        if not url_details:
            continue
        # the pager text contains "/共N頁" ("of N pages"); pull out N
        max_page = re.search("/共(.*?)頁", str(url_details)).group(1)
        urls = [head_url]
        for i in range(2, int(max_page) + 1):
            urls.append(modol_url.format(i))
            all_page_num += 1
        redis.hset("all_urls", head_url, str(urls))
    print("all page :{}".format(all_page_num))
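For example, if head_url were http://www.lzlqc.com/xwzx/Index.aspx (a made-up category URL) and the pager reported 3 pages, the loop would produce:

head_url = 'http://www.lzlqc.com/xwzx/Index.aspx'  # hypothetical category URL
base_url = head_url[:len(head_url) - len('Index.aspx')]
modol_url = base_url + "Index_{}" + ".aspx"
urls = [head_url] + [modol_url.format(i) for i in range(2, 4)]
# ['http://www.lzlqc.com/xwzx/Index.aspx',
#  'http://www.lzlqc.com/xwzx/Index_2.aspx',
#  'http://www.lzlqc.com/xwzx/Index_3.aspx']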
From the links collected earlier, gather the link of every article page
def get_all_pag_url_to_redis(self):
    values = redis.hkeys("all_urls")
    urls = set()
    page_num = 0
    urls_num = 0
    for url in values:
        url = url.decode("utf8")
        split_urls = redis.hget("all_urls", url).decode("utf8")
        for i in eval(split_urls):  # the list was stored via str(), so eval() it back
            try:
                response = requests.get(i, timeout=5)
                time.sleep(0.5)
                html = response.text
                page = etree.HTML(html)
                # article links carry "Item" in their href
                page_urls = page.xpath("//li/a[contains(@href,'Item')]/@href")
                for page_url in page_urls:
                    urls.add(page_url)
                    print("{} add over".format(page_url))
                    urls_num += 1
                print("{} already get all url".format(i))
            except Exception as e:
                print(e)
                print(i)
                print(url)
                continue
        page_num += 1
    print("{} page get!".format(page_num))
    print("{} url get!".format(urls_num))
    # originally built with a leading ',', which made split(',') yield an
    # empty first element later; ','.join avoids that
    url_s = ','.join(urls)
    redis.hset('all_splite_url', str(urls), url_s)
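Storing thousands of URLs as one comma-joined hash value is fragile; a Redis set would be a more natural fit. A sketch of that alternative (the key name article_urls is my own choice, not what the post does):

# alternative: store each article URL in a Redis set instead of one big string
for page_url in urls:
    redis.sadd("article_urls", page_url)
# and read them back later with:
urls = [u.decode("utf8") for u in redis.smembers("article_urls")]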
Fetch each article page and save its content
def get_all_conten(self):
    urls = redis.hvals("all_splite_url")
    urls = urls[0].decode('utf8').split(',')
    base_url = 'http://www.lzlqc.com'
    all_page = 0  # pages that failed
    get_page = 0  # pages saved successfully
    for ur in tqdm.tqdm(urls):
        url = base_url + ur
        try:
            response = requests.get(url, timeout=5)
            time.sleep(0.5)
            html = response.text
            page = etree.HTML(html)
            # breadcrumb of the page, used to build the directory path on disk
            path = page.xpath('//em/a/text()|//em/text()')
            clict_num = 0
            path_s = '\\'
            path_s += ''.join([i + '\\' for i in path])
            soup = BeautifulSoup(html, "html.parser")
            title = soup.find(name='div', attrs={'class': "article_infoTitle"}).find(name='span').find(name='font').string
            author = soup.find(name='div', attrs={'class': 'article_info'}).find(name='span').find(name='font')
            author = str(author)
            release_time = re.search('釋出時間:(.*?日)', author).group(1)
            author = re.search('>(.*?點選數:)', author).group(1)
            content = soup.find(name='div', attrs='article_content_list')
            content = re.sub('<[^>]+>', '', str(content))  # strip all HTML tags
            # the click count is injected by a <script>; fetch that script separately
            clict = requests.get(
                base_url + page.xpath('//div[@class="article_info"]/span/font/script/@src')[0]).text
            clict_num = re.findall("'(.*?)'", clict)[0]
            author += clict_num
            abspath = os.getcwd()
            abspath_s = abspath + '\\gets' + path_s  # Windows-style path under ./gets
            if not os.path.isdir(abspath_s[:-1]):
                os.makedirs(abspath_s[:-1])
            file_name = release_time + '-----' + title
            with open(abspath_s + file_name + '.txt', 'a', encoding='utf8') as p:
                p.write(title + "\n")
                p.write(author)
                p.write(content)
                p.write("Chang Time:{}".format(time.asctime()))
            redis.hset("contents", str(url), title + author + content)
            get_page += 1
        except Exception as e:
            print(e)
            print("url :{} get some wrong!!!!!!!!".format(url))
            with open("wrong.txt", 'a', encoding='utf8') as P:
                P.write(url + "\n")
            all_page += 1
            continue
    print("{} all page num".format(all_page))
    print("{} get page num".format(get_page))
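To see what the two regexes are pulling apart, here is a hypothetical font block in the shape the code expects (the site's actual markup may differ):

import re

# hypothetical <font> block in the shape the two regexes expect
author = '<font>作者:admin 釋出時間:2020年07月05日 點選數:</font>'
release_time = re.search('釋出時間:(.*?日)', author).group(1)
# -> '2020年07月05日'
author = re.search('>(.*?點選數:)', author).group(1)
# -> '作者:admin 釋出時間:2020年07月05日 點選數:'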
Build the start function
def run(self):
    self.get_head_index_url()
    self.get_two_index_url()
    self.get_all_url_title_redis()
    self.get_all_url_from_redis_set()
    self.get_all_split_url_to_redis()
    # not in the original listing, but get_all_conten reads the
    # 'all_splite_url' key that only this method writes
    self.get_all_pag_url_to_redis()
    self.get_all_conten()
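Assuming the class skeleton sketched earlier, kicking the crawl off would look something like this:

if __name__ == '__main__':
    spider = SchoolSpider('http://www.lzlqc.com/')  # SchoolSpider is the hypothetical class name from above
    spider.run()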
Since I had no proxy IPs, I didn't use multithreading, so the crawl is slow: roughly 3 hours to fetch a little over 12,000 pages.
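For reference, the Thread and Queue imports at the top suggest a worker-pool design was planned. A minimal sketch of that pattern (the worker count and function names are my own; without proxies, hammering the site like this risks getting blocked):

from queue import Queue
from threading import Thread

import requests

def worker(q, results):
    # pull URLs until a sentinel arrives
    while True:
        url = q.get()
        if url is None:  # sentinel: no more work
            q.task_done()
            break
        try:
            results.append((url, requests.get(url, timeout=5).text))
        except Exception as e:
            print(url, e)
        q.task_done()

def fetch_all(urls, n_workers=4):
    q = Queue()
    results = []  # list.append is thread-safe under CPython's GIL
    threads = [Thread(target=worker, args=(q, results)) for _ in range(n_workers)]
    for t in threads:
        t.start()
    for url in urls:
        q.put(url)
    for _ in threads:
        q.put(None)  # one sentinel per worker
    q.join()         # wait until every queued item is processed
    return results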