Python 爬取b站專欄圖片
阿新 • • 發佈:2020-09-16
當olinr學會了爬蟲。。。
嘿嘿嘿
# Crawl bilibili column (article) search results for a keyword and save the
# .jpg images found in each article into a folder named after the keyword.
import http.client
import urllib.request as urqt
import urllib.parse as urps
import sys
import os
import re
import shutil

# Number of images saved so far (shared by getpng and the main loop).
tot = 0
# Number of images the user asked for; set from user input in main().
num = 0

# Browser-like User-Agent sent with every HTML page request.
HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0"}

# Default folder under which the per-keyword image folder is created.
BASE_DIR = r"D:\資訊\python\一些成品\b站專欄圖片爬蟲"

# Lazy-loaded <img> tags whose protocol-relative URL ends in ".jpg".
# [^"] keeps a match inside one attribute value, so a non-jpg image cannot
# be glued to the next jpg image further down the page.
IMG_PATTERN = re.compile(r'img data-src="(//[^"]+?\.jpg)')

# Anchors carrying a title attribute followed by an href; group 1 is the href.
LINK_PATTERN = re.compile(r'a title.+? href="(.+?)"')

# Errors that mean "this one fetch failed": network/HTTP errors (OSError),
# malformed URLs or undecodable bodies (ValueError), protocol errors.
FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


def gethtml(url):
    """Fetch *url* with a browser User-Agent and return the body as text.

    The body is decoded as UTF-8. Network, HTTP and decoding errors
    propagate to the caller.
    """
    req = urqt.Request(url, headers=HEADER)
    # The context manager closes the connection even if read/decode fails.
    with urqt.urlopen(req) as resp:
        return resp.read().decode("utf-8")


def GetIntoPlace(string, base=BASE_DIR):
    """Make an empty folder named *string* under *base* and chdir into it.

    An existing folder with that name is deleted first, so each run starts
    from a clean folder. *base* must already exist.
    """
    os.chdir(base)
    # isdir (rather than a name lookup in listdir) also finds the folder on
    # case-insensitive filesystems and never hands a plain file to rmtree.
    if os.path.isdir(string):
        shutil.rmtree(string)
    os.mkdir(string)
    os.chdir(string)


def getpng(url):
    """Download one image into the current directory as "<tot>.jpg".

    A failed download is skipped silently (best effort). Once the number of
    saved images reaches the global ``num`` the whole program exits through
    ``sys.exit()``.
    """
    global tot, num
    try:
        with urqt.urlopen(url) as resp:
            data = resp.read()
    except FETCH_ERRORS:
        # Only fetch failures are skipped; KeyboardInterrupt still
        # propagates so the user can stop the crawl with Ctrl+C.
        return
    tot += 1
    with open(str(tot) + '.jpg', 'wb') as out:
        out.write(data)
    print("正在下載第 " + str(tot) + " 張")
    if tot == num:
        sys.exit()


def find_image_urls(html):
    """Return the http URL of every lazy-loaded .jpg image in *html*."""
    return ["http:" + src for src in IMG_PATTERN.findall(html)]


def getans(html):
    """Download every .jpg image referenced by the article page *html*."""
    for img_url in find_image_urls(html):
        getpng(img_url)


def find_article_urls(html):
    """Return the http URL of every titled link in a search result page.

    Only the href value is used, without the closing quote. Everything
    before the first "//" in the href is replaced with "http:", so
    protocol-relative and absolute hrefs both work. Hrefs without "//"
    (relative links) are skipped.
    """
    urls = []
    for href in LINK_PATTERN.findall(html):
        start = href.find("//")
        if start == -1:
            continue
        urls.append("http:" + href[start:])
    return urls


def work(html):
    """Visit every article linked from a search page and download its images.

    Returns the number of article links found on the page, so the caller
    can tell when the search results are exhausted.
    """
    links = find_article_urls(html)
    for link in links:
        try:
            page = gethtml(link)
        except FETCH_ERRORS:
            # One unreachable article should not abort the whole crawl.
            continue
        getans(page)
    return len(links)


def main():
    """Prompt for keyword, image count and start page, then crawl."""
    global num
    now = input("請輸入想要的圖片:")
    num = int(input("請輸入想要爬取的圖片數量:"))
    frm = int(input("請輸入爬取起始頁碼:"))
    GetIntoPlace(now)
    now = urps.quote(now, encoding="utf-8")
    while tot < num:
        url = "https://search.bilibili.com/article?keyword=" + now + "&page=" + str(frm)
        if not work(gethtml(url)):
            # A page without any article link means the results ran out;
            # without this check the loop would request pages forever.
            print("沒有更多搜尋結果,已停止。")
            break
        frm += 1


if __name__ == "__main__":
    main()