1. 程式人生 > 爬取今日頭條

爬取今日頭條

type 取數 count format mage window chrome tail con

import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of one Toutiao gallery article.

    Fetches the page at ``url``, extracts the argument of the
    ``gallery: JSON.parse(...)`` call embedded in the page source, and saves
    each entry of its ``sub_images`` list under ``download/<title>/``.

    Args:
        url: Address of the article page.
        title: Article title; used (after replacing characters that are not
            valid in file names) as the name of the download sub-directory.

    Returns:
        None. Returns early, creating nothing, when the page source contains
        no ``gallery: JSON.parse(...)`` payload.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
        )
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', html)
    if match_res is None:
        # Pages without the gallery payload have nothing to download.
        return

    # The JSON.parse argument is itself a JSON string literal, so it is
    # decoded twice: first to text, then to the actual object.
    result = json.loads(json.loads(match_res.group(1)))

    # Path separators and other reserved characters in a title would either
    # create nested directories or make the path invalid.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip() or 'untitled'
    target_dir = 'download/' + safe_title
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result.get('sub_images') or []:
        image_url = image_['url']
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, target_dir + '/' + fname + '.jpg')

    print(result)

def get_url(offset=0):
    """Crawl Toutiao search-result pages and download each gallery found.

    Starting at ``offset``, requests the search API 20 results at a time and
    calls ``get_detail`` for every result entry that carries both an
    ``article_url`` and a ``title``. Pages are processed while
    ``offset / 20 <= 4``, i.e. offsets 0, 20, 40, 60 and 80 when started
    from 0.

    NOTE(review): the original comment said "four pages", but the ``<= 4``
    bound yields five when starting at 0; the bound in the code was kept.

    Args:
        offset: Result offset of the first page to fetch.
    """
    url = (
        'https://www.toutiao.com/search_content/?offset={}&format=json'
        '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    )
    page_size = 20
    last_offset = 4 * page_size

    # Integer stepping keeps the offset an int; true division previously
    # turned it into a float, producing requests such as "offset=20.0".
    for page_offset in range(int(offset), last_offset + 1, page_size):
        response = requests.get(url.format(page_offset), timeout=10)
        res_json = response.json()

        for page in res_json.get('data') or []:
            article_url = page.get('article_url')
            title = page.get('title')
            # Entries without an article URL or a title cannot be saved.
            if article_url and title:
                get_detail(article_url, title)

if __name__ == '__main__':
    # Start fetching from the first page of results.
    get_url(0)

import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of one Toutiao gallery article.

    Fetches the page at ``url``, extracts the argument of the
    ``gallery: JSON.parse(...)`` call embedded in the page source, and saves
    each entry of its ``sub_images`` list under ``download/<title>/``.

    Args:
        url: Address of the article page.
        title: Article title; used (after replacing characters that are not
            valid in file names) as the name of the download sub-directory.

    Returns:
        None. Returns early, creating nothing, when the page source contains
        no ``gallery: JSON.parse(...)`` payload.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
        )
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', html)
    if match_res is None:
        # Pages without the gallery payload have nothing to download.
        return

    # The JSON.parse argument is itself a JSON string literal, so it is
    # decoded twice: first to text, then to the actual object.
    result = json.loads(json.loads(match_res.group(1)))

    # Path separators and other reserved characters in a title would either
    # create nested directories or make the path invalid.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip() or 'untitled'
    target_dir = 'download/' + safe_title
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result.get('sub_images') or []:
        image_url = image_['url']
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, target_dir + '/' + fname + '.jpg')

    print(result)

def get_url(offset=0):
    """Crawl Toutiao search-result pages and download each gallery found.

    Starting at ``offset``, requests the search API 20 results at a time and
    calls ``get_detail`` for every result entry that carries both an
    ``article_url`` and a ``title``. Pages are processed while
    ``offset / 20 <= 4``, i.e. offsets 0, 20, 40, 60 and 80 when started
    from 0.

    NOTE(review): the original comment said "four pages", but the ``<= 4``
    bound yields five when starting at 0; the bound in the code was kept.

    Args:
        offset: Result offset of the first page to fetch.
    """
    url = (
        'https://www.toutiao.com/search_content/?offset={}&format=json'
        '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    )
    page_size = 20
    last_offset = 4 * page_size

    # Integer stepping keeps the offset an int; true division previously
    # turned it into a float, producing requests such as "offset=20.0".
    for page_offset in range(int(offset), last_offset + 1, page_size):
        response = requests.get(url.format(page_offset), timeout=10)
        res_json = response.json()

        for page in res_json.get('data') or []:
            article_url = page.get('article_url')
            title = page.get('title')
            # Entries without an article URL or a title cannot be saved.
            if article_url and title:
                get_detail(article_url, title)

if __name__ == '__main__':
    # Start fetching from the first page of results.
    get_url(0)

爬取今日頭條