1. 程式人生 > Python實現網路爬蟲

Python實現網路爬蟲

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Author: GuangJun.Lv
# Date: 2018/07/06

import urllib2
import json
import os
import time
import datetime

import csv

# Field separator used when building the tab-delimited text report.
ts = '\t'
# Placeholder text intended for missing (null) observation values.
Non = 'N/A'
# Time-zone token searched for in an observation's "pretty" timestamp so the
# text from that token onwards can be cut off.
Date_CST = 'CST'
# Default output path of the tab-delimited text report.
out_txtfile = '/opt/data.txt'
# Default output path for a CSV report.
out_csvfile = '/opt/data.csv'

# API key interpolated into every history request URL.
# NOTE(review): the credential is hard-coded in source — consider loading it
# from the environment or a config file instead.
key = '2b0d6572c90d3e4a'
# Sample station code kept for reference (disabled).
#zmw = '58362'

def isValidDate(date):
    """Tell whether *date* is a calendar date written as yyyy-mm-dd.

    Returns True when ``time.strptime`` accepts the text with the
    ``%Y-%m-%d`` format, and False when it raises ValueError.
    """
    try:
        time.strptime(date, "%Y-%m-%d")
    except ValueError:
        is_valid = False
    else:
        is_valid = True
    return is_valid

def isValidwmo(wmo):
    """Check whether *wmo* is a station code the history API has data for.

    A probe request is sent for a fixed reference day (20170101); the code
    is considered valid only when the reply contains at least one
    observation.

    Args:
        wmo: station code as text; surrounding whitespace is ignored.

    Returns:
        True when the probe reply holds observations, otherwise False
        (including for None/empty/blank input and for replies that carry
        no 'history' / 'observations' section).

    Raises:
        Errors from ``urllib2.urlopen`` or ``json.loads`` (network failure,
        malformed reply) are not swallowed here and propagate to the caller.
    """
    # Normalise first so a blank code is rejected without a network request.
    wmo = wmo.strip() if wmo else ''
    if not wmo:
        return False

    thedate = '20170101'
    url = 'http://api.wunderground.com/api/%s/history_%s/q/zmw:00000.1.%s.json?v=wuiapp' \
          % (key, thedate, wmo)

    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # Always release the connection, even when the reply is not JSON.
        f.close()

    if not isinstance(parsed_json, dict):
        return False

    # A reply without these sections must yield False, not a KeyError.
    history = parsed_json.get('history')
    if not isinstance(history, dict):
        return False

    return bool(history.get('observations'))


def get_zmwcode(cityname):
    """Look up the zmw location code for a city name.

    The name is sent to the autocomplete service and the 'zmw' value of the
    first entry in the reply's 'RESULTS' list is returned.

    Args:
        cityname: city name as typed by the user (Python 2 byte string or
            unicode); surrounding whitespace is ignored.

    Returns:
        The zmw code of the first match, or '' when the name is
        None/empty/blank or the reply contains no usable result.

    Raises:
        Errors from ``urllib2.urlopen`` or ``json.loads`` propagate to the
        caller.
    """
    cityname = cityname.strip() if cityname else ''
    if not cityname:
        return ''

    # Local import: URL quoting is only needed by this function.
    import urllib

    # urllib.quote works on byte strings; encode unicode input first.
    if isinstance(cityname, unicode):
        cityname = cityname.encode('utf-8')

    # Quote the query so spaces and non-ASCII names form a valid URL.
    url = 'http://autocomplete.wunderground.com/aq?format=JSON&lang=zh&query=%s' \
          % urllib.quote(cityname)

    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # Always release the connection, even when the reply is not JSON.
        f.close()

    if not isinstance(parsed_json, dict):
        return ''

    results = parsed_json.get('RESULTS')
    if not results:
        return ''

    # Only the first (best) match is used.
    return results[0].get('zmw') or ''


def GJLv_To_Txt(fp, zmw, thedate):
    """Fetch one day of observations and append them to *fp* as text rows.

    Each observation becomes one line of ``ts``-separated fields in the
    column order of the header written by LPQiao_To_Txt: time, temperature,
    dew point, humidity, wind speed, wind gust, wind direction (degrees),
    wind direction (text), visibility, sea level pressure, precipitation,
    conditions.

    Args:
        fp: writable file-like object that receives the rows.
        zmw: location code inserted after 'zmw:' in the request URL.
        thedate: day to fetch, formatted yyyymmdd.

    Returns:
        None. When the reply has no observations a notice is printed and
        nothing is written.

    Raises:
        Errors from ``urllib2.urlopen`` / ``json.loads`` propagate, as does
        KeyError when an observation has no 'date' section.
    """
    url = 'http://api.wunderground.com/api/%s/history_%s/q/zmw:%s.json?v=wuiapp' \
          % (key, thedate, zmw)

    print('\n...........................')
    print('Date %s, Url: %s' % (thedate, url))
    print('\n...........................\n')

    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # Always release the connection, even when the reply is not JSON.
        f.close()

    # A reply without 'history' / 'observations' means "no data for this day".
    history = parsed_json.get('history') if isinstance(parsed_json, dict) else None
    observations = history.get('observations') if isinstance(history, dict) else None
    if not observations:
        print('No observations returned for %s' % thedate)
        return

    # Pressure comes before precipitation so the data lines up with the
    # header ("... Sea Level Pressure, Precipitation, Conditions").
    columns = ('tempm', 'dewptm', 'hum', 'wspdm', 'wgustm', 'wdird',
               'wdire', 'vism', 'pressurem', 'precipm', 'conds')

    for item in observations:
        stamp = item['date']
        pretty = stamp['pretty']

        # Reduce e.g. "12:00 AM CST on January 01, 2017" to the clock time
        # "12:00 AM". When the zone token is absent, fall back to the text
        # before " on " rather than slicing with find()'s -1 result.
        cut = pretty.find(Date_CST)
        if cut > 0:
            clock = pretty[:cut - 1]
        else:
            clock = pretty.split(' on ')[0]

        # Output timestamp is "<clock> <year>/<mon>/<mday>".
        idate = '%s %s/%s/%s' % (clock, stamp['year'], stamp['mon'], stamp['mday'])

        # Missing or empty values are reported as the 'N/A' placeholder.
        fields = [idate]
        for name in columns:
            value = item.get(name)
            fields.append(Non if value in (None, '') else value)

        # Each row keeps its trailing separator before the newline.
        fp.write(ts.join(['%s' % value for value in fields]) + ts + '\n')
    

def LPQiao_To_Txt():
    """Interactively export a city's observations to the text report.

    Prompts for a city name, a start date and an end date (yyyy-mm-dd),
    resolves the city to a zmw code via get_zmwcode, then writes a header
    line followed by one GJLv_To_Txt call per day (both ends inclusive) to
    ``out_txtfile``.

    Returns:
        None. Invalid input (unknown city, malformed date, start date after
        end date) prints a message and returns before any file is opened.
    """
    print('\n.......Game start...............\n')

    city = raw_input('請輸入城市名稱: ')
    zmw = get_zmwcode(city)
    if not zmw:
        print('輸入城市錯誤,請重新輸入......')
        return None

    begin_date_s = raw_input('請輸入起始日期,格式為yyyy-mm-dd: ')
    if not isValidDate(begin_date_s):
        print('輸入日期錯誤,請重新輸入......')
        return None

    end_date_s = raw_input('請輸入結束日期,格式為yyyy-mm-dd: ')
    if not isValidDate(end_date_s):
        print('結束日期錯誤,請重新輸入......')
        return None

    begin = datetime.date(*time.strptime(begin_date_s, "%Y-%m-%d")[:3])
    end = datetime.date(*time.strptime(end_date_s, "%Y-%m-%d")[:3])

    # A reversed range would otherwise produce a header-only file silently.
    if begin > end:
        print('起始日期晚於結束日期,請重新輸入......')
        return None

    # The header ends with a bare newline; a trailing space here would be
    # prepended to the first data row.
    header = ts.join(['TimeCST', 'TemperatureC', 'Dew PointC',
                      'Relative humidity', 'Wind Speed', 'Wind Gust',
                      'WindDirDegrees', 'Wind Direction', 'Visibility',
                      'Sea Level Pressure', 'Precipitation',
                      'Conditions']) + '\n'

    # The context manager closes the file even if a daily fetch raises.
    with open(out_txtfile, 'w') as fp:
        fp.write(header)

        d = begin
        delta = datetime.timedelta(days=1)
        while d <= end:
            # append the formatted data of one day
            GJLv_To_Txt(fp, zmw, d.strftime("%Y%m%d"))
            d += delta

    print('\n..............................')
    print('\n.......Game end...............\n')

def GJLv_To_Excel(writer, zmw, thedate):
    """Fetch one day of observations and write them as rows via *writer*.

    Each observation becomes one row in the column order of the header
    written by LPQiao_To_Excel: time, temperature, dew point, humidity,
    wind speed, wind gust, wind direction (degrees), wind direction (text),
    visibility, sea level pressure, precipitation, conditions.

    Args:
        writer: object with a ``writerow(list)`` method (a ``csv.writer``).
        zmw: station code appended after the 'zmw:00000.1.' prefix in the
            request URL.
        thedate: day to fetch, formatted yyyymmdd.

    Returns:
        None. When the reply has no observations a notice is printed and
        nothing is written.

    Raises:
        Errors from ``urllib2.urlopen`` / ``json.loads`` propagate, as does
        KeyError when an observation has no 'date' section.
    """
    url = 'http://api.wunderground.com/api/%s/history_%s/q/zmw:00000.1.%s.json?v=wuiapp' \
          % (key, thedate, zmw)

    print('\n...........................')
    print('Date %s, Url: %s' % (thedate, url))
    print('\n...........................\n')

    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # Always release the connection, even when the reply is not JSON.
        f.close()

    # A reply without 'history' / 'observations' means "no data for this day".
    history = parsed_json.get('history') if isinstance(parsed_json, dict) else None
    observations = history.get('observations') if isinstance(history, dict) else None
    if not observations:
        print('No observations returned for %s' % thedate)
        return

    # Pressure comes before precipitation so the data lines up with the
    # header ("... Sea Level Pressure, Precipitation, Conditions").
    columns = ('tempm', 'dewptm', 'hum', 'wspdm', 'wgustm', 'wdird',
               'wdire', 'vism', 'pressurem', 'precipm', 'conds')

    for item in observations:
        stamp = item['date']
        pretty = stamp['pretty']

        # Reduce e.g. "12:00 AM CST on January 01, 2017" to the clock time
        # "12:00 AM". When the zone token is absent, fall back to the text
        # before " on " rather than slicing with find()'s -1 result.
        cut = pretty.find(Date_CST)
        if cut > 0:
            clock = pretty[:cut - 1]
        else:
            clock = pretty.split(' on ')[0]

        # Output timestamp is "<clock> <year>/<mon>/<mday>".
        row = ['%s %s/%s/%s' % (clock, stamp['year'], stamp['mon'], stamp['mday'])]

        # Missing or empty values are reported as the 'N/A' placeholder.
        for name in columns:
            value = item.get(name)
            row.append(Non if value in (None, '') else value)

        writer.writerow(row)


def LPQiao_To_Excel():
    """Interactively export a station's observations to a CSV file.

    Prompts for a WMO code, a start date and an end date (yyyy-mm-dd),
    validates them, then writes a header row followed by one GJLv_To_Excel
    call per day (both ends inclusive) to '<wmo>-<start date>.csv' in the
    current directory.

    Returns:
        None. Invalid input (rejected WMO code, malformed date, start date
        after end date) prints a message and returns before any file is
        created.
    """
    print('\n.......Game start...............\n')

    citywmo = raw_input('Please input wmo code: ').strip()
    if not isValidwmo(citywmo):
        print('Invalid wmo code, please input again......')
        return None

    begin_date_s = raw_input('Please input start date, format is yyyy-mm-dd: ').strip()
    if not isValidDate(begin_date_s):
        print('Invalid start date, please input again......')
        return None

    end_date_s = raw_input('Please input end date, format is yyyy-mm-dd: ').strip()
    if not isValidDate(end_date_s):
        print('Invalid end date, please input again......')
        return None

    begin = datetime.date(*time.strptime(begin_date_s, "%Y-%m-%d")[:3])
    end = datetime.date(*time.strptime(end_date_s, "%Y-%m-%d")[:3])

    # A reversed range would otherwise produce a header-only file silently.
    if begin > end:
        print('Start date is after end date, please input again......')
        return None

    # File name is "<wmo>-<yyyy-mm-dd>.csv"; str(date) gives the ISO form.
    csv_path = '%s-%s.csv' % (citywmo, begin)

    # Binary mode is what the Python 2 csv module expects; the context
    # manager closes the file even if a daily fetch raises.
    with open(csv_path, 'wb') as fp:
        spamwriter = csv.writer(fp, dialect='excel')
        spamwriter.writerow(['TimeCST', 'TemperatureC', 'Dew PointC',
                             'Relative Humidity', 'Wind Speed', 'Wind Gust',
                             'WindDirDegrees', 'Wind Direction', 'Visibility',
                             'Sea Level Pressure', 'Precipitation',
                             'Conditions'])

        d = begin
        delta = datetime.timedelta(days=1)
        while d <= end:
            # append the formatted data of one day
            GJLv_To_Excel(spamwriter, citywmo, d.strftime("%Y%m%d"))
            d += delta

    print('\n..............................')
    print('\n.......Game end...............\n')


if __name__ == '__main__':

    # Text-report export, currently disabled; uncomment to write the
    # tab-separated text file instead:
    # LPQiao_To_Txt()

    # Action performed when run as a script: export to a CSV file.
    LPQiao_To_Excel()