1. 程式人生 > >利用Python中的matplotlib模組抓取yahoo finance裡的歷史資料並繪圖

       如何自動獲取各個公司的股票歷史資料並繪圖是金融文字情感分析專案裡的必要部分。誠然這些資料在 finance.yahoo.com 裡可以很方便地看到,但如何利用程式自動獲取、實時顯示卻是個問題。之前一直考慮寫爬蟲來抓取資料,顯然這樣做很費力且效率不高,而 Python 的 matplotlib 模組中有一個 finance 子模組,能夠很便捷地實現這一功能。

       finance.py is a collection of modules for collecting, analyzing and plotting financial data. 讓我們先看一個利用 matplotlib 模組獲取 finance.yahoo.com 裡的歷史資料並繪圖的範例,先貼程式碼:

from pylab import figure, show
from matplotlib.finance import quotes_historical_yahoo
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
import datetime
# Date range for the query.
date1 = datetime.date(2012, 1, 1)
date2 = datetime.date(2012, 11, 11)

# Tick-label format for the x axis (month-day-year).
daysFmt = DateFormatter('%m-%d-%Y')

quotes = quotes_historical_yahoo('MSFT', date1, date2)
# quotes_historical_yahoo returns None (not an empty list) when nothing could
# be fetched, so test truthiness instead of calling len() on the result.
if not quotes:
    raise SystemExit

# Each quote is a tuple; item 0 is the date number and item 1 the open price.
dates = [q[0] for q in quotes]
opens = [q[1] for q in quotes]

fig = figure()
ax = fig.add_subplot(111)
ax.plot_date(dates, opens, '-')

# format the ticks
ax.xaxis.set_major_formatter(daysFmt)
ax.autoscale_view()


# format the coords message box
def price(x):
    """Format a y value as a dollar amount with two decimals."""
    return '$%1.2f' % x


ax.fmt_xdata = DateFormatter('%Y-%m-%d')
ax.fmt_ydata = price
ax.grid(True)

# Rotate the date labels so they do not overlap, then display the figure.
fig.autofmt_xdate()
show()



quotes_historical_yahoo是一個獲取yahoo歷史資料的函式,需要輸入公司的Ticker Symbol和查詢起止日期,輸出為一緩衝檔案,具體程式碼如下:

def quotes_historical_yahoo(ticker, date1, date2, asobject=False,
                            adjusted=True, cachename=None):
    """
    Get historical data for ticker between date1 and date2.  date1 and
    date2 are datetime instances or (year, month, day) sequences.

    See :func:`parse_yahoo_historical` for explanation of output formats
    and the *asobject* and *adjusted* kwargs.

    Ex::

        sp = f.quotes_historical_yahoo('^GSPC', d1, d2,
                                       asobject=True, adjusted=True)
        returns = (sp.open[1:] - sp.open[:-1])/sp.open[1:]
        [n,bins,patches] = hist(returns, 100)
        mu = mean(returns)
        sigma = std(returns)
        x = normpdf(bins, mu, sigma)
        plot(bins, x, color='red', lw=2)

    cachename is the name of the local file cache.  If None, will
    default to the md5 hash of the url (which incorporates the ticker
    and date range)

    Returns None when no rows were parsed, or when reading the data
    raised an IOError (a warning is issued in that case).
    """
    # Maybe enable a warning later as part of a slow transition
    # to using None instead of False.
    #if asobject is False:
    #    warnings.warn("Recommend changing to asobject=None")

    fh = fetch_historical_yahoo(ticker, date1, date2, cachename)

    try:
        ret = parse_yahoo_historical(fh, asobject=asobject,
                                     adjusted=adjusted)
    except IOError as exc:
        # strerror may be None; fall back to the exception itself so the
        # warning always carries a readable message.
        warnings.warn('fh failure\n%s' % (exc.strerror or exc,))
        return None
    finally:
        # The handle is not returned to the caller, so release it here.
        fh.close()

    if len(ret) == 0:
        return None
    return ret
def fetch_historical_yahoo(ticker, date1, date2, cachename=None, dividends=False):
    """
    Fetch historical data for ticker between date1 and date2.  date1 and
    date2 are date or datetime instances, or (year, month, day) sequences.

    Ex::

        fh = fetch_historical_yahoo('^GSPC', (2000, 1, 1), (2001, 12, 31))

    cachename is the name of the local file cache.  If None, will
    default to the md5 hash of the url (which incorporates the ticker
    and date range)

    set dividends=True to return dividends instead of price data.  With
    this option set, parse functions will not work

    a file handle is returned
    """

    ticker = ticker.upper()

    # Normalise both dates to (month - 1, day, year), the order in which the
    # URL template below consumes them.
    if iterable(date1):
        d1 = (date1[1]-1, date1[2], date1[0])
    else:
        d1 = (date1.month-1, date1.day, date1.year)
    if iterable(date2):
        d2 = (date2[1]-1, date2[2], date2[0])
    else:
        d2 = (date2.month-1, date2.day, date2.year)

    # Value of the 'g' query parameter.
    # NOTE(review): 'v' (dividends) / 'd' (daily prices) follow upstream
    # matplotlib.finance -- confirm against the service being queried.
    if dividends:
        g = 'v'
        verbose.report('Retrieving dividends instead of prices')
    else:
        g = 'd'

    urlFmt = 'http://table.finance.yahoo.com/table.csv?a=%d&b=%d&c=%d&d=%d&e=%d&f=%d&s=%s&y=0&g=%s&ignore=.csv'

    url = urlFmt % (d1[0], d1[1], d1[2],
                    d2[0], d2[1], d2[2], ticker, g)

    if cachename is None:
        cachename = os.path.join(cachedir, md5(url).hexdigest())
    if os.path.exists(cachename):
        fh = open(cachename)
        verbose.report('Using cachefile %s for %s'%(cachename, ticker))
    else:
        # Make sure the directory that will hold the cache file exists.
        cache_parent = os.path.dirname(os.path.abspath(cachename))
        if not os.path.isdir(cache_parent):
            os.makedirs(cache_parent)

        # Download fully before touching the cache so a failed request does
        # not leave an empty cache file behind.
        urlfh = urlopen(url)
        try:
            payload = urlfh.read()
        finally:
            urlfh.close()

        with open(cachename, 'wb') as cache_file:
            cache_file.write(payload)
        verbose.report('Saved %s data to cache file %s'%(ticker, cachename))
        fh = open(cachename, 'r')

    return fh


def parse_yahoo_historical(fh, adjusted=True, asobject=False):
    """
    Parse the historical data in file handle fh from yahoo finance.

    *adjusted*
      If True (default) replace open, close, high, and low prices with
      their adjusted values. The adjustment is by a scale factor, S =
      adjusted_close/close. Adjusted prices are actual prices
      multiplied by S.

      Volume is not adjusted as it is already backward split adjusted
      by Yahoo. If you want to compute dollars traded, multiply volume
      by the adjusted close, regardless of whether you choose adjusted
      = True|False.

    *asobject*
      If False (default for compatibility with earlier versions)
      return a list of tuples containing

        d, open, close, high, low, volume

      If None (preferred alternative to False), return
      a 2-D ndarray corresponding to the list of tuples.

      Otherwise return a numpy recarray with

        date, year, month, day, d, open, close, high, low,
        volume, adjusted_close

      where d is a floating point representation of date,
      as returned by date2num, and date is a python standard
      library datetime.date instance.

      The name of this kwarg is a historical artifact.  Formerly,
      True returned a cbook Bunch
      holding 1-D ndarrays.  The behavior of a numpy recarray is
      very similar to the Bunch.
    """
    results = []

    # The first line is skipped (presumably the CSV column header).
    # Rows stay in the order they appear in the file.
    # NOTE(review): confirm whether callers expect chronological order.
    for line in fh.readlines()[1:]:
        vals = line.split(',')
        if len(vals) != 7:
            continue      # add warning?
        # The date column is 'YYYY-MM-DD'; splitting it by hand avoids
        # time.strptime, which doubles the runtime.
        dt = datetime.date(*[int(val) for val in vals[0].split('-')])
        dnum = date2num(dt)
        # 'open_' rather than 'open' so the builtin is not shadowed.
        open_, high, low, close = [float(val) for val in vals[1:5]]
        volume = float(vals[5])
        aclose = float(vals[6])

        results.append((dt, dt.year, dt.month, dt.day,
                        dnum, open_, close, high, low, volume, aclose))
    d = np.array(results, dtype=stock_dt)
    if adjusted:
        scale = d['aclose'] / d['close']
        # A zero close yields an infinite factor; mark those rows as NaN.
        scale[np.isinf(scale)] = np.nan
        d['open'] *= scale
        d['close'] *= scale
        d['high'] *= scale
        d['low'] *= scale

    if not asobject:
        # 2-D sequence; formerly list of tuples, now ndarray.
        # The builtin float replaces the np.float alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same.
        ret = np.zeros((len(d), 6), dtype=float)
        ret[:,0] = d['d']
        ret[:,1] = d['open']
        ret[:,2] = d['close']
        ret[:,3] = d['high']
        ret[:,4] = d['low']
        ret[:,5] = d['volume']
        if asobject is None:
            return ret
        return [tuple(row) for row in ret]

    return d.view(np.recarray)  # Close enough to former Bunch return


# This example downloads the historical data from finance.yahoo.com and saves it to a local file.

import os,urllib2,urllib

ticker = 'MSFT'           # the ticker symbol
date1 = (2012, 1, 1)      # start of the range as (year, month, day)
date2 = (2012, 11, 11)    # end of the range as (year, month, day)

# Reorder to (month - 1, day, year), the order the URL template consumes.
d1 = (date1[1]-1, date1[2], date1[0])
d2 = (date2[1]-1, date2[2], date2[0])

# Value of the 'g' query parameter; it was referenced but never defined.
# NOTE(review): 'd' (daily prices) follows upstream matplotlib.finance --
# confirm against the service being queried.
g = 'd'

urlFmt = 'http://table.finance.yahoo.com/table.csv?a=%d&b=%d&c=%d&d=%d&e=%d&f=%d&s=%s&y=0&g=%s&ignore=.csv'
url = urlFmt % (d1[0], d1[1], d1[2],
                d2[0], d2[1], d2[2], ticker, g)  # the url of historical data
print(url)

path = r'C:\Users\yinyao\Desktop\Python code'  # directory to save into
# No leading backslash: a rooted second segment makes os.path.join discard
# the directory part of `path` on Windows.
file_name = 'ticker.csv'                       # file name
dest_dir = os.path.join(path, file_name)       # full path of the target file
urllib.urlretrieve(url, dest_dir)              # download the data into that file


