1. 程式人生 > 爬蟲_百度圖片下載

爬蟲_百度圖片下載

user XML try -s name afa get odin .get

幫別的院的同學批量下載一些圖片，並依圖片來源頁的標題關鍵字進行簡單篩選。

import json
import os
import re

import requests
from lxml import etree


 9 def get_html(url, param):
10     headers = {
11         User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36
12 } 13 response = requests.get(url, params=param, headers=headers) 14 response.encoding = response.apparent_encoding 15 # 返回json數據,str格式 16 json_str = response.text 17 response.encoding = utf-8 18 return json_str 19 20 21 def parse_page(html): 22 data = json.loads(html, strict=False)
23 objs = data[data][:-1] 24 urls = [] 25 for obj in objs: 26 url = obj[middleURL] 27 key = obj[fromPageTitleEnc] 28 # if ‘紋枯病‘ in key: 29 if 全蝕病 in key: 30 urls.append(url) 31 # if ‘葉銹病‘ in key: 32 # urls.append(url)
33 # elif ‘條銹病‘ in key: 34 # urls.append(url) 35 else: 36 print(該標題被篩選掉: +key) 37 print(len(urls)) 38 a = len(urls) 39 return urls, a 40 # return url_list 41 42 43 def run(keyword, path): 44 url = "https://image.baidu.com/search/acjson" 45 # https://image.baidu.com/search/acjson?ipn=rj&tn=resultjson_com&word=小麥紋枯病矢量圖大圖&pn=30 46 i = 0 47 sum_pic = 0 48 for j in range(30, 1800, 30): 49 params = { 50 "ipn": "rj", 51 "tn": "resultjson_com", 52 "word": keyword, 53 "pn": str(j) 54 } 55 html = get_html(url, params) 56 lists, num_pic = parse_page(html) 57 sum_pic += num_pic 58 59 for item in lists: 60 try: 61 img_data = requests.get(item, timeout=10).content 62 with open(path + "/" + str(i) + ".jpg", "wb") as f: 63 f.write(img_data) 64 f.close() 65 i = i+1 66 except requests.exceptions.ConnectionError: 67 print(can not download) 68 continue 69 70 def make_dir(keyword): 71 path = "images/" 72 path = path+keyword 73 is_exists = os.path.exists(path) 74 if not is_exists: 75 os.makedirs(path) 76 return path 77 else: 78 print(path + 目錄已存在) 79 return path 80 81 82 def main(): 83 # keyword = ‘小麥紋枯病矢量圖大圖‘ 84 keyword = 小麥全蝕病 85 path = make_dir(keyword) 86 run(keyword, path) 87 88 89 90 if __name__ == __main__: 91 main()

爬蟲_百度圖片下載