Python實現網路爬蟲
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: GuangJun.Lv
# Date: 2018/07/06
import urllib2
import json
import os
import time
import datetime
import csv
# Field separator placed after every value in the tab-delimited text output.
ts = '\t'
# Placeholder string for empty ("null") observation values.
Non = 'N/A'
# Time-zone marker searched for in the API's "pretty" date strings.
Date_CST = 'CST'
# Path of the text file written by LPQiao_To_Txt.
out_txtfile = '/opt/data.txt'
# CSV output path. LPQiao_To_Excel builds its own file name from the station
# code and start date rather than reading this value.
out_csvfile = '/opt/data.csv'
# API key interpolated into every api.wunderground.com request URL.
# NOTE(review): this credential is committed in source; consider loading it
# from the environment or a config file instead.
key = '2b0d6572c90d3e4a'
# Example station code, kept for reference (disabled): zmw = '58362'
def isValidDate(date):
    """Return True if ``date`` is a string in ``yyyy-mm-dd`` form, else False.

    Args:
        date: Candidate date string, e.g. ``'2017-01-31'``.

    Returns:
        bool: True when ``time.strptime`` accepts the value with the
        ``%Y-%m-%d`` format; False for malformed or impossible dates and
        for non-string input such as ``None``.
    """
    try:
        time.strptime(date, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: wrong format or out-of-range field.
        # TypeError: the argument was not a string at all.
        return False
    return True
def isValidwmo(wmo):
    """Check a WMO station code by probing the history API for a fixed day.

    The code is considered valid when the history endpoint returns at least
    one observation for 2017-01-01 at ``zmw:00000.1.<wmo>``.

    Args:
        wmo: Station code as typed by the user; surrounding whitespace is
            ignored.

    Returns:
        bool: True if the response contains a non-empty
        ``history.observations`` list; False for an empty/blank code or a
        response without observations (including API error payloads that
        carry no ``history`` section).

    Raises:
        urllib2.URLError: If the request itself fails.
        ValueError: If the response body is not valid JSON.
    """
    # Strip before testing for emptiness so a whitespace-only code is
    # rejected instead of producing a malformed URL.
    wmo = wmo.strip()
    if not wmo:
        return False
    thedate = '20170101'
    url = 'http://api.wunderground.com/api/%s/history_%s/q/zmw:00000.1.%s.json?v=wuiapp' \
        % (key, thedate, wmo)
    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # Release the connection even when reading or parsing fails.
        f.close()
    if not isinstance(parsed_json, dict):
        return False
    history = parsed_json.get('history')
    if not isinstance(history, dict):
        # No 'history' section: treat as "not valid" rather than raising.
        return False
    return bool(history.get('observations'))
def get_zmwcode(cityname):
    """Look up the zmw location code of a city via the autocomplete service.

    Args:
        cityname: City name as typed by the user (a byte string when it
            comes from ``raw_input``); surrounding whitespace is ignored.

    Returns:
        str: The ``zmw`` value of the first entry in the response's
        ``RESULTS`` list, or ``''`` when the name is empty/blank or the
        service returns no usable result.

    Raises:
        urllib2.URLError: If the request itself fails.
        ValueError: If the response body is not valid JSON.
    """
    # Function-scope import: only this lookup needs URL quoting.
    import urllib

    cityname = cityname.strip()
    if not cityname:
        return ''
    # Percent-encode the name so spaces and non-ASCII bytes cannot break
    # the query string.
    url = 'http://autocomplete.wunderground.com/aq?format=JSON&lang=zh&query=%s' \
        % urllib.quote(cityname)
    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # Release the connection even when reading or parsing fails.
        f.close()
    if not isinstance(parsed_json, dict):
        return ''
    results = parsed_json.get('RESULTS')
    if not results:
        return ''
    # Callers test the result for emptiness, so never hand back None.
    return results[0].get('zmw') or ''
def GJLv_To_Txt(fp, zmw, thedate):
    """Fetch one day of observations and append them to ``fp`` as text rows.

    Each observation becomes one line. Every value is followed by the
    ``ts`` separator (so lines end with a trailing separator) and the
    values are written in this order: time/date, tempm, dewptm, hum, wspdm,
    wgustm, wdird, wdire, vism, precipm, pressurem, conds.

    Args:
        fp: Writable file-like object that receives the lines.
        zmw: Location code placed after ``zmw:`` in the request URL.
        thedate: Day to fetch, formatted ``YYYYMMDD``.

    Raises:
        urllib2.URLError: If the request fails.
        ValueError: If the response body is not valid JSON.
        KeyError: If the response lacks ``history.observations`` or an
            observation lacks one of the expected fields.
    """
    url = 'http://api.wunderground.com/api/%s/history_%s/q/zmw:%s.json?v=wuiapp' \
        % (key, thedate, zmw)
    print('\n...........................')
    print('Date %s, Url: %s' % (thedate, url))
    print('\n...........................\n')
    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # The payload is fully read above; release the connection even if
        # reading or parsing fails.
        f.close()
    fields = ('tempm', 'dewptm', 'hum', 'wspdm', 'wgustm', 'wdird',
              'wdire', 'vism', 'precipm', 'pressurem', 'conds')
    for item in parsed_json['history']['observations']:
        stamp = item['date']
        pretty = stamp['pretty']
        # Reformat e.g. "12:00 AM CST on January 01, 2017" to
        # "12:00 AM 2017/01/01". When the zone marker is absent, keep the
        # whole pretty string instead of slicing with find()'s -1.
        cut = pretty.find(Date_CST)
        clock = pretty[:cut - 1] if cut > 0 else pretty
        idate = '%s %s/%s/%s' % (clock, stamp['year'], stamp['mon'], stamp['mday'])
        values = [idate] + [item[name] for name in fields]
        # Empty ("null") values are written as the N/A placeholder.
        values = [Non if v is None or v == '' else v for v in values]
        fp.write(''.join('%s%c' % (v, ts) for v in values) + '\n')
def LPQiao_To_Txt():
    """Interactively export a city's daily observations to the text file.

    Prompts for a city name and a start/end date (``yyyy-mm-dd``), resolves
    the city through ``get_zmwcode``, writes a tab-separated header line to
    ``out_txtfile`` and then appends one day of observations at a time via
    ``GJLv_To_Txt`` for every date from start to end inclusive. If the end
    date precedes the start date only the header is written.

    Returns:
        None. Returns early, after printing a message, when the city cannot
        be resolved or a date is invalid.
    """
    print('\n.......Game start...............\n')
    city = raw_input('請輸入城市名稱: ')
    zmw = get_zmwcode(city)
    if not zmw:
        print('輸入城市錯誤,請重新輸入......')
        return None
    begin_date_s = raw_input('請輸入起始日期,格式為yyyy-mm-dd: ')
    if not isValidDate(begin_date_s):
        print('輸入日期錯誤,請重新輸入......')
        return None
    end_date_s = raw_input('請輸入結束日期,格式為yyyy-mm-dd: ')
    if not isValidDate(end_date_s):
        print('結束日期錯誤,請重新輸入......')
        return None
    begin = datetime.datetime.strptime(begin_date_s, "%Y-%m-%d").date()
    end = datetime.datetime.strptime(end_date_s, "%Y-%m-%d").date()
    # Header labels follow the order in which GJLv_To_Txt writes the values:
    # precipitation comes before sea level pressure.
    columns = ('TimeCST', 'TemperatureC', 'Dew PointC', 'Relative humidity',
               'Wind Speed', 'Wind Gust', 'WindDirDegrees', 'Wind Direction',
               'Visibility', 'Precipitation', 'Sea Level Pressure', 'Conditions')
    # The context manager closes the file even if a day's download fails.
    with open(out_txtfile, 'w') as fp:
        # End the header with a bare newline so the first data row does not
        # start with a stray space.
        fp.write(ts.join(columns) + '\n')
        d = begin
        one_day = datetime.timedelta(days=1)
        while d <= end:
            # Append the formatted data for one day.
            GJLv_To_Txt(fp, zmw, d.strftime("%Y%m%d"))
            d += one_day
    print('\n..............................')
    print('\n.......Game end...............\n')
def GJLv_To_Excel(writer, zmw, thedate):
    """Fetch one day of observations and write them as CSV rows.

    Each observation becomes one row whose values are, in order: time/date,
    tempm, dewptm, hum, wspdm, wgustm, wdird, wdire, vism, precipm,
    pressurem, conds.

    Args:
        writer: Object with a ``writerow(list)`` method, e.g. a
            ``csv.writer``.
        zmw: Station code appended to ``zmw:00000.1.`` in the request URL.
        thedate: Day to fetch, formatted ``YYYYMMDD``.

    Raises:
        urllib2.URLError: If the request fails.
        ValueError: If the response body is not valid JSON.
        KeyError: If the response lacks ``history.observations`` or an
            observation lacks one of the expected fields.
    """
    url = 'http://api.wunderground.com/api/%s/history_%s/q/zmw:00000.1.%s.json?v=wuiapp' \
        % (key, thedate, zmw)
    print('\n...........................')
    print('Date %s, Url: %s' % (thedate, url))
    print('\n...........................\n')
    f = urllib2.urlopen(url)
    try:
        parsed_json = json.loads(f.read())
    finally:
        # The payload is fully read above; release the connection even if
        # reading or parsing fails.
        f.close()
    fields = ('tempm', 'dewptm', 'hum', 'wspdm', 'wgustm', 'wdird',
              'wdire', 'vism', 'precipm', 'pressurem', 'conds')
    for item in parsed_json['history']['observations']:
        stamp = item['date']
        pretty = stamp['pretty']
        # Reformat e.g. "12:00 AM CST on January 01, 2017" to
        # "12:00 AM 2017/01/01". When the zone marker is absent, keep the
        # whole pretty string instead of slicing with find()'s -1.
        cut = pretty.find(Date_CST)
        clock = pretty[:cut - 1] if cut > 0 else pretty
        idate = '%s %s/%s/%s' % (clock, stamp['year'], stamp['mon'], stamp['mday'])
        row = [idate] + [item[name] for name in fields]
        # Empty ("null") values are written as the N/A placeholder.
        writer.writerow([Non if v is None or v == '' else v for v in row])
def LPQiao_To_Excel():
    """Interactively export a station's daily observations to a CSV file.

    Prompts for a WMO station code and a start/end date (``yyyy-mm-dd``),
    validates them, then writes ``<wmo>-<start date>.csv`` in the current
    directory: one header row followed by the rows produced by
    ``GJLv_To_Excel`` for every date from start to end inclusive. If the
    end date precedes the start date only the header is written.

    Returns:
        None. Returns early, after printing a message, when the station
        code or a date is invalid.
    """
    print('\n.......Game start...............\n')
    citywmo = raw_input('Please input wmo code: ').strip()
    if not isValidwmo(citywmo):
        print('Invalid wmo code, please input again......')
        return None
    begin_date_s = raw_input('Please input start date, format is yyyy-mm-dd: ').strip()
    if not isValidDate(begin_date_s):
        print('Invalid start date, please input again......')
        return None
    end_date_s = raw_input('Please input end date, format is yyyy-mm-dd: ').strip()
    if not isValidDate(end_date_s):
        print('Invalid end date, please input again......')
        return None
    begin = datetime.datetime.strptime(begin_date_s, "%Y-%m-%d").date()
    end = datetime.datetime.strptime(end_date_s, "%Y-%m-%d").date()
    # Use a distinct local name so the module-level out_csvfile constant is
    # not shadowed.
    csv_path = '%s-%s.csv' % (citywmo, begin)
    # Header labels follow the order in which GJLv_To_Excel writes the
    # values: precipitation comes before sea level pressure.
    header = ['TimeCST', 'TemperatureC', 'Dew PointC', 'Relative Humidity',
              'Wind Speed', 'Wind Gust', 'WindDirDegrees', 'Wind Direction',
              'Visibility', 'Precipitation', 'Sea Level Pressure', 'Conditions']
    # Binary mode is what the Python 2 csv module expects; the context
    # manager closes the file on exit, including on errors.
    with open(csv_path, 'wb') as fp:
        spamwriter = csv.writer(fp, dialect='excel')
        spamwriter.writerow(header)
        d = begin
        one_day = datetime.timedelta(days=1)
        while d <= end:
            # Append the formatted rows for one day.
            GJLv_To_Excel(spamwriter, citywmo, d.strftime("%Y%m%d"))
            d += one_day
    print('\n..............................')
    print('\n.......Game end...............\n')
if __name__ == '__main__':
    # Script entry point. Only the CSV export runs by default; the
    # tab-separated text export is kept below, disabled.
    # LPQiao_To_Txt()
    LPQiao_To_Excel()