爬取的每個資料儲存格式:{ 圖片的訪問路徑,評論數,點贊數,帖子的內容 }
"img_url": "https://scontent-sin6-2.cdninstagram.com/vp/0e345bfd870f2fb489f091ed5507397f/5C1A8CB6/t51.2885-15/e35/40949123_1104283529724860_6046749716819964824_n.jpg",
"comment_count": 12932,
"like_count": 1321753,
"text": "Featured photo by @maomay__\\nWeekend Hashtag Project: #WHPperspective\\nThis weekend, the goal is to take photos and videos from a different point of view, as in this featured photo by Mao May (@maomay__). Here are some tips to get you started:\\nCapture a familiar subject or scene from an unexpected angle. Get up close and let a face cover the entire frame, or make a puppy look large by shooting from ground-level as she stares down. Find a high vantage point to show the wider context of a festival scene or bustling market.\\nUse geometry to your advantage. Look for graphic lines — in bridges or telephone wires — that converge to a vanishing point in your composition. Find a new way to capture patterns in everyday places, like the wheels of bicycles lined up in a rack, or symmetrical bricks in an unruly garden.\\nPlay an eye trick. Defy gravity with simple editing, like rotating the frame. Recruit a friend to make a well-timed leap, that, when rotated, looks like they’re flying through air. Or turn a dandelion into a human-size parasol by playing with scale and distance.\\n\\nPROJECT RULES: Please add the #WHPperspective hashtag only to photos and videos shared over this weekend and only submit your own visuals to the project. If you include music in your video submissions, please only use music to which you own the rights. Any tagged photo or video shared over the weekend is eligible to be featured next week."
import hashlib
import json
import random
import re
import time

import requests
from pyquery import PyQuery as pq

# Profile page that is scraped first; it embeds the first 12 posts.
url_base = 'https://www.instagram.com/instagram/'
# GraphQL endpoint template used to page through the remaining posts.
uri = 'https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22{user_id}%22%2C%22first%22%3A12%2C%22after%22%3A%22{cursor}%22%7D'

headers = {
    'Connection':'keep-alive',
    'Host':'www.instagram.com',
    'Referer':'https://www.instagram.com/instagram/',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'X-Requested-With':'XMLHttpRequest'
}

# Proxy settings handed to every requests call; fill in as needed.
proxy = {
    'http': '',
    'https': ''
}


def hashStr(strInfo):
    """Return the hexadecimal MD5 digest of *strInfo* encoded as UTF-8."""
    return hashlib.md5(strInfo.encode("utf-8")).hexdigest()


def get_html(url):
    """Fetch *url* and return the response body as text.

    Returns:
        The page source on HTTP 200, otherwise ``None`` (the failure is
        printed, not raised).
    """
    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    except requests.RequestException as e:
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    print('請求網頁原始碼錯誤, 錯誤狀態碼:', response.status_code)
    return None


def get_json(headers, url, max_retries=None):
    """Fetch *url* and return the decoded JSON body, retrying on failure.

    After every failed attempt (request error, non-200 status or undecodable
    body) the function sleeps 60-100 seconds before trying again.

    Args:
        headers: HTTP headers sent with the request.
        url: Address to request.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries until a request succeeds, which
            matches the original behaviour.

    Returns:
        The decoded JSON document, or ``None`` once *max_retries* is exhausted.
    """
    # NOTE(review): the original indentation was lost, so it is unclear whether
    # a non-200 status was retried or returned None; it is retried here.
    attempts = 0
    # A loop replaces the original unbounded recursion, which would eventually
    # hit the interpreter's recursion limit.
    while True:
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
            if response.status_code == 200:
                return response.json()
            print('請求網頁json錯誤, 錯誤狀態碼:', response.status_code)
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(e)
        attempts += 1
        if max_retries is not None and attempts > max_retries:
            return None
        time.sleep(60 + float(random.randint(1, 4000)) / 100)


def _fetch_caption(shortcode):
    """Return the caption text of the post *shortcode*, or '' if it has none."""
    # https://www.instagram.com/p/{shortcode}/?__a=1
    text_url = 'https://www.instagram.com/p/' + shortcode + '/?__a=1'
    text_response = get_json(headers, text_url)
    if not text_response:
        return ''
    caption_edges = text_response['graphql']['shortcode_media']['edge_media_to_caption']['edges']
    if not caption_edges:
        # Posts without a caption have an empty edge list.
        return ''
    # Read the field directly instead of slicing the repr of the node dict,
    # which escaped newlines/quotes and assumed the node had a single key.
    return caption_edges[0]['node']['text']


def _build_sample(node, like_key):
    """Build and print one sample dict from a timeline media *node*.

    *like_key* names the node entry holding the like count; the embedded page
    data and the GraphQL responses use different keys for it.
    """
    sample = {
        "img_url": node['display_url'],
        "comment_count": node['edge_media_to_comment']["count"],
        "like_count": node[like_key]["count"],
        "text": '',
    }
    shortcode = node.get('shortcode')
    if shortcode:
        # Pause between caption requests to stay gentle on the server.
        time.sleep(1)
        sample["text"] = _fetch_caption(shortcode)
    print(sample["img_url"])
    print(sample["comment_count"])
    print(sample["like_count"])
    print(sample["text"])
    return sample


def get_samples(html, max_samples=120):
    """Collect image samples for the profile whose page source is *html*.

    The posts embedded in the page's ``window._sharedData`` script are read
    first; further pages are then requested from the GraphQL endpoint until
    there is no next page or more than *max_samples* samples were collected.
    Video posts are skipped.

    Args:
        html: Source of the profile page.
        max_samples: Paging stops once more than this many samples exist.

    Returns:
        A list of dicts with the keys ``img_url``, ``comment_count``,
        ``like_count`` and ``text``.

    Raises:
        IndexError: If the user id or the ``rhx_gis`` token is not in *html*.
    """
    samples = []
    user_id = re.findall('"profilePage_([0-9]+)"', html, re.S)[0]
    rhx_gis = re.findall('"rhx_gis":"([0-9a-z]+)"', html, re.S)[0]

    print('user_id:' + user_id)
    print(rhx_gis)

    # Defaults so the paging loop is simply skipped (instead of raising
    # NameError) when the page carries no window._sharedData script.
    cursor = None
    has_next = False

    doc = pq(html)
    for item in doc('script[type="text/javascript"]').items():
        script_text = item.text().strip()
        if not script_text.startswith('window._sharedData'):
            continue
        # Turn "window._sharedData = {...};" into the bare JSON document.
        payload = script_text.partition('=')[2].strip().rstrip(';')
        # json.loads() no longer accepts an ``encoding`` argument (Python 3.9+).
        js_data = json.loads(payload)
        timeline = js_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]
        # Paging information: cursor of the next page and whether one exists.
        page_info = timeline['page_info']
        cursor = page_info['end_cursor']
        has_next = page_info['has_next_page']
        # Initial posts embedded in the page.
        for edge in timeline["edges"]:
            node = edge['node']
            # is_video is a JSON boolean; comparing it with the string "true"
            # never matched, so videos were not skipped.
            if node['is_video']:
                continue
            samples.append(_build_sample(node, 'edge_liked_by'))
        print(cursor, has_next)

    # AJAX requests for the remaining pages.
    while has_next:
        url = uri.format(user_id=user_id, cursor=cursor)
        print(url)
        query_variables = '{"id":"' + user_id + '","first":12,"after":"' + cursor + '"}'
        print(query_variables)
        # The request signature is the MD5 of "<rhx_gis>:<query variables>".
        # The module-level headers dict is updated in place, as before, so
        # later caption requests carry the header too.
        headers['X-Instagram-GIS'] = hashStr(rhx_gis + ":" + query_variables)
        print(headers)
        js_data = get_json(headers, url)
        if not js_data:
            break
        timeline = js_data['data']['user']['edge_owner_to_timeline_media']
        cursor = timeline['page_info']['end_cursor']
        has_next = timeline['page_info']['has_next_page']
        for info in timeline['edges']:
            node = info['node']
            if node['is_video']:
                continue
            samples.append(_build_sample(node, 'edge_media_preview_like'))
        print(cursor, has_next)
        # Stop once enough samples have been downloaded.
        if len(samples) > max_samples:
            return samples
    return samples


def main():
    """Scrape the profile at ``url_base`` and append the samples to ./samples.txt."""
    html = get_html(url_base)
    if html is None:
        # get_html() already reported the failure; there is nothing to parse.
        return
    samples = get_samples(html)
    with open("./samples.txt", "a", encoding='utf-8') as f:
        f.write(str(samples))


if __name__ == '__main__':
    # NOTE(review): start is not used in the visible code; presumably kept for
    # timing the run -- confirm before removing.
    start = time.time()
    main()