爬取今日頭條

阿新 • • 發佈：2018-08-17

type 取數 count format mage window chrome tail con

import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of one Toutiao gallery article.

    Fetches ``url``, looks for the ``gallery: JSON.parse(...)`` payload in the
    page source, decodes it, and saves each entry of its ``sub_images`` list
    into ``download/<title>/`` (one file per image, ``.jpg`` appended).

    Args:
        url: Address of the article page,
            e.g. https://www.toutiao.com/a6589905154147877384/#p=3
        title: Article title; used as the sub-directory name. Characters
            that are not allowed in directory names are replaced by ``_``.

    Returns:
        None. Pages without a gallery payload are skipped silently.
    """
    headers = {
        # NOTE(review): "Windows NTr" looks like a typo for "Windows NT";
        # left untouched because the value is sent to the server as-is.
        'User-Agent': 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    page_source = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', page_source)
    if match_res is None:
        # Not a gallery page (or the markup changed): nothing to download.
        return

    # The captured argument is decoded twice: the first pass yields a string
    # that itself contains the JSON document with the gallery data.
    match_count = json.loads(match_res.group(1))
    result = json.loads(match_count)
    print(type(result))

    # Titles are free text; strip characters that would break the path.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result['sub_images']:
        image_url = image_['url']
        # The last path segment of the image URL becomes the file name.
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0):
    """Crawl search-result pages and download the images of each article.

    Requests the ``search_content`` endpoint at ``offset``, hands every result
    that carries an ``article_url`` to ``get_detail`` together with its
    ``title``, then moves on by 20 results. Crawling stops once the page
    index (``offset / 20``) is greater than 4.

    Args:
        offset: Result offset of the first page to fetch.

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    page_size = 20  # matches the ``count=20`` query parameter
    last_page = 4   # highest page index that is still processed

    # Iterating (instead of recursing) keeps ``offset`` an integer, so the URL
    # never receives a value such as ``20.0``, and no page is fetched only to
    # be thrown away.
    while offset / page_size <= last_page:
        response = requests.get(url.format(offset))
        res_json = response.json()

        # ``data`` may be absent or null when the server has nothing to return.
        for page in res_json.get('data') or []:
            if 'article_url' in page:
                get_detail(page['article_url'], page['title'])

        offset += page_size

if __name__ == '__main__':
    # Start fetching from the first page.
    get_url(0)

import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of one Toutiao gallery article.

    Fetches ``url``, looks for the ``gallery: JSON.parse(...)`` payload in the
    page source, decodes it, and saves each entry of its ``sub_images`` list
    into ``download/<title>/`` (one file per image, ``.jpg`` appended).

    Args:
        url: Address of the article page,
            e.g. https://www.toutiao.com/a6589905154147877384/#p=3
        title: Article title; used as the sub-directory name. Characters
            that are not allowed in directory names are replaced by ``_``.

    Returns:
        None. Pages without a gallery payload are skipped silently.
    """
    headers = {
        # NOTE(review): "Windows NTr" looks like a typo for "Windows NT";
        # left untouched because the value is sent to the server as-is.
        'User-Agent': 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    page_source = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', page_source)
    if match_res is None:
        # Not a gallery page (or the markup changed): nothing to download.
        return

    # The captured argument is decoded twice: the first pass yields a string
    # that itself contains the JSON document with the gallery data.
    match_count = json.loads(match_res.group(1))
    result = json.loads(match_count)
    print(type(result))

    # Titles are free text; strip characters that would break the path.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result['sub_images']:
        image_url = image_['url']
        # The last path segment of the image URL becomes the file name.
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0):
    """Crawl search-result pages and download the images of each article.

    Requests the ``search_content`` endpoint at ``offset``, hands every result
    that carries an ``article_url`` to ``get_detail`` together with its
    ``title``, then moves on by 20 results. Crawling stops once the page
    index (``offset / 20``) is greater than 4.

    Args:
        offset: Result offset of the first page to fetch.

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    page_size = 20  # matches the ``count=20`` query parameter
    last_page = 4   # highest page index that is still processed

    # Iterating (instead of recursing) keeps ``offset`` an integer, so the URL
    # never receives a value such as ``20.0``, and no page is fetched only to
    # be thrown away.
    while offset / page_size <= last_page:
        response = requests.get(url.format(offset))
        res_json = response.json()

        # ``data`` may be absent or null when the server has nothing to return.
        for page in res_json.get('data') or []:
            if 'article_url' in page:
                get_detail(page['article_url'], page['title'])

        offset += page_size

if __name__ == '__main__':
    # Start fetching from the first page.
    get_url(0)

爬取今日頭條

爬取今日頭條

用接口爬取今日頭條圖片

Python爬取今日頭條段子

使用python-aiohttp爬取今日頭條

爬取今日頭條收藏夾文章列表信息

爬取今日頭條中的圖片

爬取今日頭條

python爬取今日頭條關鍵字圖集

部落格搬家系列（六）-爬取今日頭條文章

爬取今日頭條街拍圖的一次教訓

Ajax爬取今日頭條街拍美圖

python --爬蟲基礎 --爬取今日頭條使用 requests 庫的基本操作, Ajax

python爬蟲爬取今日頭條APP資料（無需破解as ,cp，_cp_signature引數）

(爬蟲)採用BeautifulSoup和正則爬取今日頭條圖集.詳細!

Python3從零開始爬取今日頭條的新聞【一、開發環境搭建】

通過分析ajax，使用正則表示式爬取今日頭條

Python3從零開始爬取今日頭條的新聞【五、解析頭條視訊真實播放地址並自動下載】

Ajax爬取今日頭條街拍

Python web爬取今日頭條的街拍

python爬取今日頭條圖片

爬取今日頭條收藏夾文章列表資訊

爬取今日頭條

相關推薦