python 爬取網頁天天基金

阿新 • • 發佈：2020-08-17

# encoding=utf-8
import pandas as pd
import requests
from lxml import etree
import re
import collections


def _parse_rank_rows(text, width):
    """Turn the raw rankhandler response text into a list of field lists.

    The response is expected to embed ``{datas:["f1,f2,...","f1,f2,..."],allRecords...``;
    every double-quoted string inside ``datas`` is one fund record whose
    fields are comma separated.

    Args:
        text: Response body returned by the ranking endpoint.
        width: Number of fields each returned row must have.

    Returns:
        A list of rows, each a list of exactly ``width`` entries.

    Raises:
        ValueError: If the ``datas`` array cannot be located in ``text``.
    """
    matched = re.search("{datas:\\[(.*)\\],allRecords", text)
    if matched is None:
        raise ValueError("unexpected ranking response: 'datas' array not found")
    rows = []
    # Parse record by record instead of slicing one flat list into fixed
    # chunks, so the row count follows the response and a malformed record
    # cannot shift the columns of every record after it.
    for record in re.findall(r'"([^"]*)"', matched.group(1)):
        fields = record.split(",")
        # Pad/trim to the expected width so the DataFrame constructor does
        # not fail on a ragged record.
        # NOTE(review): assumes 25 fields per record - confirm against the API.
        rows.append((fields + [None] * width)[:width])
    return rows


def fund_code_name():
    """Return the codes of the 50 funds with the best one-week growth rate.

    Downloads the open-fund ranking list from fund.eastmoney.com, loads it
    into a DataFrame, ranks it by the one-week growth column, prints the
    top-50 fund codes and saves the top-50 rows to
    ``./本季度前50強基金收益.csv``.

    Returns:
        list: Fund codes (strings) of the top 50 funds, best first.

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error.
        ValueError: If the response does not contain the expected payload.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Referer': 'http://fund.eastmoney.com/data/fundranking.html',
        'Cookie': 'st_si=51694067779834; st_asi=delete; ASP.NET_SessionId=e1pno0koqkcp5es3xyzyrg1n; EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; _adsame_fullscreen_18503=1; EMFUND9=08-16 01:16:38@#$%u4E07%u5BB6%u65B0%u5229%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408@%23%24519191; st_pvi=87492384111747; st_sp=2020-08-16%2000%3A05%3A17; st_inirUrl=http%3A%2F%2Ffund.eastmoney.com%2Fdata%2Ffundranking.html; st_sn=15; st_psi=20200816011636912-0-9218336114',
    }
    response = requests.get(
        url='http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=2018-11-26&ed=2019-11-26&qdii'
            '=&tabSubtype=,,,,,&pi=1&pn=6450&dx=1&v=0.6516597604405057',
        headers=request_headers,
        timeout=30,  # never hang forever on a stalled connection
    )
    response.raise_for_status()

    columns = ["基金程式碼", "基金簡稱", "基金條碼", "日期",
               "單位淨值", "累計淨值", "日增長率", "近1周增長率", "近1月增長率", "近3月", "近半年", "近1年", "近2年", "近3年",
               "今年來", "成立來", "其他1", "其他2", "其他3", "其他4", "其他5", "其他6", "其他7", "其他8", "其他9"]
    df = pd.DataFrame(_parse_rank_rows(response.text, len(columns)), columns=columns)
    df_part = df[["基金程式碼", "基金簡稱", "日期",
                  "單位淨值", "累計淨值", "日增長率", "近1周增長率", "近1月增長率", "近3月", "近半年"]]

    # The scraped values are strings; sorting them directly is lexicographic
    # ("9.8" > "10.5"). Rank on a numeric copy and reorder the original rows,
    # so the CSV still contains the values exactly as scraped. Unparseable or
    # missing values become NaN and sort last.
    weekly_growth = pd.to_numeric(df_part["近1周增長率"], errors="coerce")
    ranked_index = weekly_growth.sort_values(ascending=False, kind="mergesort").index
    df_tmp = df_part.loc[ranked_index]

    top_funds = df_tmp.head(50)
    fund_codes_list = top_funds["基金程式碼"].values.tolist()
    print("前50強基金：", fund_codes_list)
    top_funds.to_csv("./本季度前50強基金收益.csv", encoding="utf_8_sig")
    return fund_codes_list


def get_one_fund_stocks(fund_code):
    """Return the stock codes listed in a fund's most recent holdings table.

    Requests the holdings fragment (``topline=10``) for ``fund_code`` from
    fundf10.eastmoney.com, wraps the returned HTML snippet in a minimal
    document and collects the text of every link in the first holdings table
    that consists solely of digits (the stock codes).

    Args:
        fund_code: Fund code to look up; substituted into the request URL.

    Returns:
        list: Stock codes as strings. Empty when the response carries no
        holdings payload or the table contains no numeric stock codes.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    url = "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code={}&topline=10&year=&month=&rt=0.5032668912422176".format(
        fund_code)
    head = {
        "Cookie": "EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; st_si=44023331838789; st_asi=delete; EMFUND9=08-16 22:04:25@#$%u4E07%u5BB6%u65B0%u5229%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408@%23%24519191; ASP.NET_SessionId=45qdofapdlm1hlgxapxuxhe1; st_pvi=87492384111747; st_sp=2020-08-16%2000%3A05%3A17; st_inirUrl=http%3A%2F%2Ffund.eastmoney.com%2Fdata%2Ffundranking.html; st_sn=12; st_psi=2020081622103685-0-6169905557",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
    }

    response = requests.get(url, headers=head, timeout=30)  # avoid hanging forever
    # The endpoint answers with a JS-like object; the HTML fragment sits in
    # its ``content`` field.
    matched = re.search('content:\\"(.*)\\",arryear', response.text)
    if matched is None:
        # No holdings payload (unknown fund, blocked request, changed format):
        # report "no stocks" instead of crashing the whole ranking run.
        return []

    html_body = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>test</title></head><body>%s</body></html>' % (
        matched.group(1))
    html = etree.HTML(html_body)
    stock_links = html.xpath('//div[1]/div/table/tbody/tr/td/a')
    # Links in the table also include names and other columns; only the
    # all-digit texts are kept as stock codes.
    stock_one_fund = [link.text for link in stock_links
                      if link.text and link.text.isdigit()]
    if len(stock_one_fund) > 1:
        print("基金程式碼：{}".format(fund_code), "基金持有前10股票池", stock_one_fund)
    return stock_one_fund  # may be an empty list


def static_best_stock(rank=20):
    """ 統計收益最佳前50機構共同持有股票程式碼情況,修改rank數量可調整展示股票排名數目"""
    # NOTE: the docstring above is printed verbatim at runtime (via __doc__
    # below), so its text is intentionally left untouched.
    #
    # Purpose: count how many of the top-ranked funds hold each stock and
    # print the `rank` most widely held stock codes.
    #   rank -- number of rows of the final table to print (default 20).
    # Returns nothing; all results are written to stdout.
    top_funds = fund_code_name()

    # Banner is shown once, and only when there is at least one fund to scan.
    if top_funds:
        print("<" * 30 + "FBI WARNING近1周收益最高基金的排名高到低排序以及股票池情況" + ">" * 30)

    held_stocks = []
    for code in top_funds:
        picks = get_one_fund_stocks(code)
        # Funds that report fewer than two stocks are left out of the tally.
        if len(picks) > 1:
            held_stocks += picks

    holders_per_stock = collections.Counter(held_stocks)
    print("<" * 30 + "FBI WARNING,{}".format(static_best_stock.__doc__) + ">" * 30)
    print("#" * 30 + "本季度基金機構共同持有股票數目排行前{}股票程式碼情況".format(rank) + "#" * 30)

    # One row per stock code, ordered by the number of funds holding it.
    table = (
        pd.DataFrame.from_dict(holders_per_stock, orient='index', columns=["持有該股機構數目"])
        .reset_index()
        .rename(columns={"index": "股票程式碼"})
        .sort_values(by="持有該股機構數目", ascending=False)
    )
    print(table.head(rank))


# Script entry point: run the full report with the default number of rows.
if __name__ == "__main__":
    static_best_stock()

備註：本文僅供個人練習與學習之用；若被用於違法行為，作者概不負責。

python 爬取網頁天天基金

# encoding=utf-8 import pandas as pd import requests from lxml import etree import re import collections def fund_code_name():

只要30行程式碼！7步教會你Python爬取網頁抖音熱門視訊

前言抖音短視訊相信大家都聽過，也不陌生對吧！可以看到海量的短視訊，涵蓋了各大行業。個人覺得抖音有毒，刷著刷著根本停不下來，一看時間就是凌晨3、4點。今天帶大家爬取抖音網頁版的視訊資料！一睹為快吧

利用python爬取網頁圖片

\"\"\"利用python爬取網頁圖片\"\"\" import requests import urllib from bs4 import BeautifulSoup import json

Python爬取網頁資訊的示例

Python爬取網頁資訊的步驟以爬取英文名字網站（https://nameberry.com/）中每個名字的評論內容，包括英文名，使用者名稱，評論的時間和評論的內容為例。

使用Python爬取網頁圖片

下載https://www.mayiwenku.com/p-4957235.html 網頁的MATLAB答案下載一張照片 import requests headers = {\"User-Agent\":\"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72

python爬取網頁圖片

1. 爬取一個頁面的圖片 # encoding=gbk import re import requests url='http://www.netbian.com/' data=requests.get(url).text

Python爬取網頁上想要的資料

原始碼如下 from urllib.request import urlopen,Request import urllib.request import re from bs4 import BeautifulSoup

python-爬取網頁

爬蟲簡介：網路蜘蛛、網路機器人，即抓取網路資料的程式。其實就是用Python程式模仿人點選瀏覽器並訪問網站，而且模仿得越逼真越好。

python-爬取網頁簡繁體轉換功能

from selenium.webdriver import Chrome, ChromeOptions from selenium.webdriver.common.by import By import time

python爬取天天基金網所有基金資料

多執行緒+代理池爬取天天基金網、股票資料(無需使用爬蟲框架) 簡介提到爬蟲，大部分人都會想到使用Scrapy工具，但是僅僅停留在會使用的階段。為了增加對爬蟲機制的理解，我們可以手動實現多執行緒的爬蟲過

Python如何使用BeautifulSoup爬取網頁資訊

這篇文章主要介紹了Python如何使用BeautifulSoup爬取網頁資訊,文中通過示例程式碼介紹的非常詳細，對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下

python爬取Ajax動態載入網頁過程解析

常見的反爬機制及處理方式 1、Headers反爬蟲：Cookie、Referer、User-Agent 解決方案: 通過F12獲取headers,傳給requests.get()方法

Python基於pandas爬取網頁表格資料

以網頁表格為例：https://www.kuaidaili.com/free/ 該網站資料存在table標籤，直接用requests，需要結合bs4解析正則/xpath/lxml等，沒有幾行程式碼是搞不定的。

基於Python爬取fofa網頁端資料過程解析

FOFA-網路空間安全搜尋引擎是網路空間資產檢索系統（FOFA）是世界上資料覆蓋更完整的IT裝置搜尋引擎，擁有全球聯網IT裝置更全的DNA資訊。探索全球網際網路的資產資訊，進行資產及漏洞影響範圍分析、應用分佈統計、應

python 爬取指定網頁中的圖片（python crawls the image in the specified page）

來自《Python專案案例開發從入門到實戰》（清華大學出版社鄭秋生夏敏捷主編）中爬蟲應用——抓取百度圖片

python 爬取指定網頁中的圖片精細版（python crawls the image in the specified page fine version）

來自《Python專案案例開發從入門到實戰》（清華大學出版社鄭秋生夏敏捷主編）中爬蟲應用——抓取百度圖片

python如何爬取網頁中的文字

用Python進行爬取網頁文字的程式碼： #!/usr/bin/python # -*- coding: UTF-8 -*- import requests

Python實現爬取網頁中動態載入的資料

在使用python爬蟲技術採集資料資訊時，經常會遇到在返回的網頁資訊中，無法抓取動態載入的可用資料。例如，獲取某網頁中，商品價格時就會出現此類現象。如下圖所示。本文將實現爬取網頁中類似的動態載入的資料。

python Selenium 和 PyAutoGUI合璧爬取網頁攻略

前一段時間在做關於美國請願網站的研究，需要爬取change.org這個請願網站上每個請願的資訊。大致爬蟲順序是：先爬取每個標籤下所有請願的名字和具體網址，訪問每個具體網址爬取請願的發起時間、內容等資訊。這裡就需

另類Python爬蟲，利用pandas庫的read_html()方法爬取網頁表格型資料

文章目錄一、簡介二、原理三、爬取實戰例項1 例項2 一、簡介很多人學習python，不知道從何學起。很多人學習python，掌握了基本語法過後，不知道在哪裡尋找案例上手。很多已經做案例的人，卻不知道

python 爬取網頁天天基金

相關推薦