Python爬取妹子圖上傳到WordPress並使用阿里雲OSS
阿新 • • 發佈:2018-12-09
#!/usr/bin/env python
# coding=utf-8
"""Crawl photo sets from mmjpg.com, mirror them to Aliyun OSS and publish
each set as a ``royal_portfolio`` post in a WordPress database.

Pipeline for one set URL (see ``urls_crawler``):

1. scrape the set title, tags and the image URL of every page;
2. download each image below ``DIR_PATH`` and upload it to the OSS bucket;
3. insert the post, its attachments, term relationships and post meta
   directly into the WordPress tables (prefix ``wd``).
"""
import os
import time
import threading
import datetime
import hashlib
import oss2
import phpserialize
from multiprocessing import Pool, cpu_count
import requests
import pymysql
from bs4 import BeautifulSoup

# Timestamp of the run; used for the uploads/YYYY/MM prefix and post dates.
now = datetime.datetime.now()

HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': "http://www.mmjpg.com"
}

DIR_PATH = r"/var/www/python/mmjpg"  # local directory the images are saved to

# The AccessKey of an Aliyun root account can call every API and is therefore
# high risk. Create and use a RAM sub-account for API access and day-to-day
# operations instead: https://ram.console.aliyun.com
auth = oss2.Auth('ak', 'sk')
# Endpoint: replace with the endpoint of the region your bucket lives in.
bucket = oss2.Bucket(auth, 'http://oss-cn-shenzhen-internal.aliyuncs.com', 'bucket')

# Columns shared by the portfolio post and its attachment rows.
_POST_COLUMNS = (
    'post_author', 'post_date', 'post_date_gmt', 'post_content', 'post_title',
    'post_excerpt', 'post_status', 'comment_status', 'ping_status',
    'post_password', 'post_name', 'to_ping', 'pinged', 'post_modified',
    'post_modified_gmt', 'post_content_filtered', 'post_parent', 'guid',
    'menu_order', 'post_type', 'post_mime_type', 'comment_count',
)
_INSERT_POST_SQL = (
    "INSERT INTO `wordpress`.`wdposts` ("
    + ", ".join("`" + column + "`" for column in _POST_COLUMNS)
    + ") VALUES (" + ", ".join(["%s"] * len(_POST_COLUMNS)) + ")"
)
_INSERT_META_SQL = (
    "insert into wdpostmeta(post_id,meta_key,meta_value) values (%s,%s,%s)"
)

# Term taxonomy ids every published set is attached to (site specific).
_FIXED_TERM_TAXONOMY_IDS = (43, 30, 227941)

# Thumbnail variants recorded in _wp_attachment_metadata: (size name, w, h).
_IMAGE_SIZES = (
    ("thumbnail", 150, 150),
    ("medium", 200, 300),
    ("medium_large", 768, 1152),
    ("large", 683, 1024),
    ("royal-similar-items", 350, 350),
    ("royal-search-results", 150, 150),
    ("royal-blog-post", 750, 450),
    ("royal-portfolio-post", 500, 340),
    ("post-thumbnail", 800, 450),
    ("detail", 150, 150),
)

_CUSTOM_CSS = (
    '.vc_custom_1444807993418{padding-right: 35px !important;'
    'padding-left: 35px !important;}'
    '.vc_custom_1444987361109{padding-right: 35px !important;'
    'padding-left: 35px !important;}'
)


def save_pic(pic_src, pic_cnt, folder_name):
    """Download one image, store it locally and upload it to OSS.

    :param pic_src: URL of the image to download.
    :param pic_cnt: zero-based index of the image inside its set; the file
        is named ``pic_cnt_<pic_cnt + 1>.jpg``.
    :param folder_name: name of the set directory below ``DIR_PATH``; it is
        also used as a prefix of the OSS object name.
    :return: public OSS URL of the uploaded image, or ``None`` when the
        download or the upload failed (the error is printed).
    """
    try:
        img = requests.get(pic_src, headers=HEADERS, timeout=10)
        # Do not store an HTTP error page as if it were a JPEG.
        img.raise_for_status()
        img_name = "pic_cnt_{}.jpg".format(pic_cnt + 1)
        # Write to the absolute path that is uploaded below instead of
        # relying on the current working directory, and truncate ('wb')
        # so a re-run cannot append to a stale file.
        local_path = os.path.join(DIR_PATH, folder_name, img_name)
        with open(local_path, 'wb') as f:
            f.write(img.content)
        object_key = ('uploads/' + now.strftime('%Y') + '/' + now.strftime('%m')
                      + '/' + folder_name + img_name)
        # Upload only after the file has been closed and flushed.
        bucket.put_object_from_file(object_key, local_path)
        print(img_name)
        # Public address of the object on Aliyun OSS.
        return 'http://123.oss-cn-shenzhen.aliyuncs.com/' + object_key
    except Exception as e:
        print(e)
        return None


def make_dir(folder_name):
    """Create the directory of a set and change into it.

    :param folder_name: directory name below ``DIR_PATH``.
    :return: ``True`` when the directory was created; ``False`` when it
        already existed, which means the set was crawled before and can be
        skipped.
    """
    path = os.path.join(DIR_PATH, folder_name)
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
        os.chdir(path)
        return True
    print("Folder has existed!")
    return False


def delete_empty_dir(save_dir):
    """Recursively remove empty directories below (and including) save_dir.

    If an earlier run was interrupted, a set directory may exist without any
    downloaded image. ``make_dir`` would then treat the set as already
    crawled, so such empty directories are removed first.

    :param save_dir: root directory to clean up.
    """
    if not os.path.exists(save_dir):
        print("Please start your performance!")
        return
    if not os.path.isdir(save_dir):
        # A plain file has no children to inspect and must not be listed.
        return
    for d in os.listdir(save_dir):
        path = os.path.join(save_dir, d)  # path of the next level
        if os.path.isdir(path):
            delete_empty_dir(path)  # recurse into sub-directories
    if not os.listdir(save_dir):
        os.rmdir(save_dir)
        print("remove the empty dir: {}".format(save_dir))


lock = threading.Lock()  # global resource lock


def _fix_encoding(text):
    """Undo requests' ISO-8859-1 decoding of a page that is really UTF-8."""
    return text.encode('ISO-8859-1').decode('utf-8')


def _collect_image_urls(url, soup):
    """Return the image URL found on every page of the set at ``url``."""
    # The second to last pager link holds the number of pages (= images).
    max_count = soup.find('div', class_='page').find_all('a')[-2].get_text()
    page_urls = [url + "/" + str(i) for i in range(1, int(max_count) + 1)]
    img_urls = []
    for index, page_url in enumerate(page_urls):
        result = requests.get(page_url, headers=HEADERS, timeout=10).text
        content = BeautifulSoup(result, 'lxml').find('div', class_='content')
        # On the last page the image is not wrapped in an <a> tag, so it has
        # to be located differently.
        if index + 1 < len(page_urls):
            img_urls.append(content.find('a').img['src'])
        else:
            img_urls.append(content.find('img')['src'])
    return img_urls


def _attachment_metadata(path_name, pic_number):
    """Build the ``_wp_attachment_metadata`` structure for one image."""
    img_file_name = path_name + 'pic_cnt_' + str(pic_number)
    sizes = {}
    for size_name, width, height in _IMAGE_SIZES:
        sizes[size_name] = {
            "file": "{}-{}x{}.jpg".format(img_file_name, width, height),
            "width": str(width),
            "height": str(height),
            "mime-type": "image/jpeg",
        }
    return {
        "width": "800",
        "height": "1200",
        "hwstring_small": "height='96' width='64'",
        "file": now.strftime('%Y') + '/' + now.strftime('%m') + '/'
                + img_file_name + '.jpg',
        "sizes": sizes,
        "image_meta": {
            "aperture": "0",
            "credit": "",
            "camera": "",
            "caption": "",
            "created_timestamp": "0",
            "copyright": "",
            "focal_length": "0",
            "iso": "0",
            "shutter_speed": "0",
            "title": "",
            "orientation": "0",
            "keywords": {},
        },
    }


def _publish_to_wordpress(folder_name, path_name, post_tags, uploaded):
    """Insert the set and its images into the WordPress database.

    All statements are parameterized (the title and tags are scraped,
    untrusted text) and run in one transaction, so a failure leaves no
    half-published post behind.

    :param folder_name: title of the set.
    :param path_name: slug / directory name of the set.
    :param post_tags: tag names scraped from the set page.
    :param uploaded: non-empty list of ``(index, oss_url)`` pairs, one per
        successfully uploaded image; ``index`` is the zero-based position
        of the image in the set.
    """
    now_time = now.strftime('%Y-%m-%d %H:%M:%S')
    permalink = 'http://meizg.louislivi.com/?royal_portfolio=' + path_name
    post_content = (
        '[vc_row][vc_column][vc_column_text] <h2>' + folder_name + '</h2> '
        '[/vc_column_text][/vc_column][/vc_row]'
        '[vc_row full_width="stretch_row_content" '
        'css=".vc_custom_1444807993418{padding-right: 35px !important;'
        'padding-left: 35px !important;}"][vc_column]'
        '[ultimate_spacer height="60"][/vc_column][/vc_row]'
        '[vc_row full_width="stretch_row_content" '
        'css=".vc_custom_1444987361109{padding-right: 35px !important;'
        'padding-left: 35px !important;}"][vc_column]'
        '[royal_portfolio portfolio_display_filters="yes" '
        'portfolio_display_title="yes" portfolio_display_testimonial="yes" '
        'portfolio_posts_number="15" portfolio_columns_rate="+1" '
        'portfolio_gutter_horz="17" portfolio_gutter_vert="17" '
        'portfolio_stretch_container="yes"][/vc_column][/vc_row]'
    )

    # Keyword arguments: positional ones are rejected by PyMySQL >= 1.0.
    # utf8mb4 is required to store the Chinese titles and tags.
    db = pymysql.connect(host="127.0.0.1", user="root", password="123456",
                         database="wordpress", charset="utf8mb4")
    try:
        with db.cursor() as cursor:
            # 1. The portfolio post itself.
            cursor.execute(_INSERT_POST_SQL, (
                1, now_time, now_time, post_content, folder_name, '',
                'publish', 'open', 'closed', '', path_name, '', '',
                now_time, now_time, '', 0, permalink, 0,
                'royal_portfolio', '', 0,
            ))
            post_id = cursor.lastrowid

            # 2. Attach the post to the fixed terms and to every scraped tag
            #    that already exists as a term.
            tag_taxonomy_ids = []
            if post_tags:  # "IN ()" would be a syntax error
                cursor.execute(
                    "select term_taxonomy_id from wdterms right join "
                    "wdterm_taxonomy on wdterm_taxonomy.term_id=wdterms.term_id "
                    "where name in ("
                    + ",".join(["%s"] * len(post_tags)) + ")",
                    list(post_tags))
                tag_taxonomy_ids = [row[0] for row in cursor.fetchall()]
            cursor.executemany(
                "insert into wdterm_relationships(object_id,term_taxonomy_id) "
                "VALUES (%s,%s)",
                [(post_id, taxonomy_id)
                 for taxonomy_id in list(_FIXED_TERM_TAXONOMY_IDS) + tag_taxonomy_ids])
            if tag_taxonomy_ids:
                # The ids collected above are term_taxonomy ids, so the
                # counter has to be matched on that column.
                cursor.execute(
                    "update wdterm_taxonomy set count=count+1 "
                    "where term_taxonomy_id in ("
                    + ",".join(["%s"] * len(tag_taxonomy_ids)) + ")",
                    tag_taxonomy_ids)

            # 3. One attachment post per image, plus its attachment meta.
            img_post_ides = []
            attachment_meta = []
            for index, oss_url in uploaded:
                attachment_name = folder_name + str(index)
                cursor.execute(_INSERT_POST_SQL, (
                    1, now_time, now_time, attachment_name, attachment_name,
                    attachment_name, 'inherit', 'open', 'closed', '',
                    attachment_name, '', '', now_time, now_time, '',
                    post_id, oss_url, 0, 'attachment', 'image/jpeg', 0,
                ))
                img_post_id = cursor.lastrowid
                img_post_ides.append(img_post_id)

                img_name = path_name + 'pic_cnt_' + str(index + 1) + '.jpg'
                meta_value = phpserialize.dumps(
                    _attachment_metadata(path_name, index + 1))
                attachment_meta.append(
                    (img_post_id, '_wp_attachment_metadata',
                     meta_value.decode('utf-8')))
                attachment_meta.append(
                    (img_post_id, '_wp_attached_file',
                     '/var/www/html/wp-content/uploads/'
                     + now.strftime('%Y') + '/' + now.strftime('%m')
                     + '/' + img_name))
            cursor.executemany(_INSERT_META_SQL, attachment_meta)

            # 4. Meta of the portfolio post. Clamp the indices so that sets
            #    with a single image do not raise IndexError.
            last_index = len(img_post_ides) - 1
            featured_id = str(img_post_ides[min(1, last_index)])
            second_featured_id = str(img_post_ides[max(last_index - 1, 0)])
            post_meta = [
                ('_thumbnail_id', str(img_post_ides[-1])),
                ('_vc_post_settings', 'a:1:{s:10:"vc_grid_id";a:0:{}}'),
                ('slide_template', 'default'),
                ('rf_metro_post_width', '1x'),
                ('rf_exc_featured_img', featured_id),
                ('rf_audio_type', 'embed'),
                ('rf_audio_embed', ''),
                ('rf_audio_self_mp3', ''),
                ('rf_audio_self_ogg', ''),
                ('rf_video_type', 'embed'),
                ('rf_video_embed', ''),
                ('rf_video_self_mp4', ''),
                ('rf_video_self_ogv', ''),
                ('rf_gallery_type', 'stacked'),
                ('rf_gallery_img_ids', ",".join(str(i) for i in img_post_ides)),
                ('rf_gallery_imgs_src', ",".join(u for _, u in uploaded)),
                ('rf_back_link', str(int(post_id) - 1)),
                ('rf_project_desc_title', folder_name),
                ('rf_project_description', folder_name),
                ('rf_project_details_title', ''),
                ('rf_project_client', 'meizg.com'),
                ('rf_project_url', permalink),
                ('rf_testimonial_author', 'meizg.com'),
                ('rf_testimonial_content', ''),
                ('rf_revslider_shortcode', ''),
                ('rf_revslider_select', 'none'),
                ('rf_project_info_sticky', 'no'),
                ('second_featured_img_id', second_featured_id),
                ('_wpb_vc_js_status', 'true'),
                ('_wpb_shortcodes_custom_css', _CUSTOM_CSS),
                ('rf_enable_project_info', 'yes'),
                ('_edit_lock', '1535778316:1'),
                ('_edit_last', '1'),
                ('rf_project_info_offset', '0'),
                ('rf_project_ext_url', ''),
                ('_wp_trash_meta_status', 'publish'),
                ('_wp_trash_meta_time', '1535773952'),
                ('_wp_desired_post_slug', 'gallery-slideshow-3'),
            ]
            cursor.executemany(
                _INSERT_META_SQL,
                [(post_id, key, value) for key, value in post_meta])

        # Single commit: either the whole set is published or nothing is.
        db.commit()
    except Exception as e:
        # Roll back the partial work and report why publishing failed.
        db.rollback()
        print(e)
    finally:
        db.close()


def urls_crawler(url):
    """Crawler entry point: scrape, mirror and publish one photo set.

    :param url: URL of the set, e.g. ``http://mmjpg.com/mm/1459``.

    Errors are printed and swallowed so that one broken set does not abort
    the whole pool.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=10).text
        soup = BeautifulSoup(r, 'lxml')
        # Title of the set.
        folder_name = _fix_encoding(soup.find('h2').text)
        tags_div = soup.find('div', class_='tags')
        post_tags = [_fix_encoding(tag.text) for tag in tags_div.find_all('a')] \
            if tags_div is not None else []
        # Short, filesystem-safe directory name / slug derived from the title.
        path_name = hashlib.md5(folder_name.encode('utf-8')).hexdigest()[8:-8]
        with lock:
            if not make_dir(path_name):
                return
            img_urls = _collect_image_urls(url, soup)
            # Keep the original index next to each URL so file names and
            # attachment titles stay correct when a download fails.
            uploaded = []
            for cnt, img_url in enumerate(img_urls):
                oss_url = save_pic(img_url, cnt, path_name)
                if oss_url is not None:
                    uploaded.append((cnt, oss_url))
            if not uploaded:
                print("No image uploaded for {}, nothing to publish".format(url))
                return
            _publish_to_wordpress(folder_name, path_name, post_tags, uploaded)
    except Exception as e:
        print(e)


if __name__ == "__main__":
    today = datetime.date.today()
    today_time = int(time.mktime(today.timetuple()))
    # The site publishes one set per day: set number 1459 corresponds to the
    # epoch 1535644800, and 28800 s is the UTC+8 offset.
    cnt_num = int((today_time - 28800 - 1535644800) / 86400 + 1459)
    # To back-fill the whole archive use range(1, 1459) instead.
    urls = ['http://mmjpg.com/mm/{cnt}'.format(cnt=cnt)
            for cnt in range(cnt_num - 1, cnt_num)]
    pool = Pool(processes=cpu_count())
    try:
        try:
            delete_empty_dir(DIR_PATH)
            pool.map(urls_crawler, urls)
        except Exception:
            # Retry once after a short pause.
            time.sleep(30)
            delete_empty_dir(DIR_PATH)
            pool.map(urls_crawler, urls)
    finally:
        pool.close()
        pool.join()