用python抓取摩拜单车API数据
最近摩拜单车超级火,下班了各种骑,然后膝盖伤了。。。。。。
-----------------------------------------------------------------------
看到这篇文章http://mp.weixin.qq.com/s?__biz=MzA5NDExMTAzNA==&mid=2649982414&idx=1&sn=68b638c4f019baa3a783c045b294d6de&chksm=8854b19bbf23388dd91509a2a692736a5c0505ab144c5605d4015330044699c20e5c42f92496&mpshare=1&scene=1&srcid=0519Fkgd6ew4TuOHUerHMlZs#rd
说好的给源码,但貌似没给。。。。
然后就自己写了一份
--------------------------------------------------------------------------------------
import requests
import numpy as np
import json
import multiprocessing
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Suppress urllib3's InsecureRequestWarning (worker() posts with verify=False).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Number of worker processes; the scanned area is split into this many strips.
PROCESS_NUM = 4

# Simulated request headers (MicroMessenger user agent, servicewechat.com
# referer) sent with every API call.
HEADERS = {
    'charset': 'utf-8',
    'platform': '4',
    'referer': 'https://servicewechat.com/wx40f112341ae33edb/1/',
    'content-type': 'application/x-www-form-urlencoded',
    'user-agent': 'MicroMessenger/6.5.4.1000 NetType/WIFI Language/zh_CN',
    'host': 'mwx.mobike.com',
    'connection': 'Keep-Alive',
    'accept-encoding': 'gzip',
    'cache-control': 'no-cache',
}

# Initial latitude/longitude bounds for part of Lucheng District.
# NOTE: the names do not match the axes. worker() iterates latitude from LEFT
# to RIGHT (descending) and longitude from TOP to BOTTOM (ascending), so
# LEFT/RIGHT are latitudes and TOP/BOTTOM are longitudes. The names are kept
# as they are so existing references keep working.
LEFT = 28.026735
TOP = 120.644784
RIGHT = 27.99344
BOTTOM = 120.715068
# Order expected by split_pos() and worker(): (top, left, bottom, right).
INIT_POS = (TOP, LEFT, BOTTOM, RIGHT)
# Split an area into num strips; each strip is later handled by one process.
def split_pos(init_pos, num):
    """Split a bounding box into ``num`` equal strips along its first axis.

    Args:
        init_pos: Sequence whose first four items are ``(x1, y1, x2, y2)``.
        num: Number of strips to produce.

    Returns:
        A list of ``num`` tuples ``(x_start, y1, x_end, y2)``. Consecutive
        strips share a boundary value; no overlap margin is added.
    """
    x1, y1, x2, y2 = init_pos[0], init_pos[1], init_pos[2], init_pos[3]
    # num strips need num + 1 evenly spaced boundaries between x1 and x2.
    bounds = np.linspace(x1, x2, num + 1)
    # Pair every boundary with its successor to form the strips.
    return [(start, y1, end, y2) for start, end in zip(bounds[:-1], bounds[1:])]
# Parse the response and save it in tabular form.
# (Writing straight to MongoDB would also work; it is not installed on this machine.)
def save(text, output):
    """Parse one API response and write its records to ``output``.

    Each dict in the response's ``'object'`` list becomes one line: the
    values, ordered by sorted key, joined with ``'$'``. ``None`` values are
    written as empty strings.

    Problems are reported on stdout instead of being raised:
        ***ERR1***  ``text`` is not valid JSON.
        ***ERR2***  the payload has no usable ``'object'`` list.
        ***ERR3***  a record is not a dict; processing stops there, and
                    records written before it are kept.

    Args:
        text: Raw response body (JSON text).
        output: Writable text file object.
    """
    try:
        json_data = json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        print('***ERR1*** ', text)
        return
    if not isinstance(json_data, dict) or 'object' not in json_data:
        print('***ERR2*** ', text)
        return
    records = json_data['object']
    if not isinstance(records, list):
        print('***ERR2*** ', text)
        return
    for record in records:
        if not isinstance(record, dict):
            print('***ERR3*** ', record)
            return
        # Sorting the keys keeps the column order stable between records.
        values = [
            '' if record[key] is None else str(record[key])
            for key in sorted(record)
        ]
        output.write('$'.join(values) + '\n')
# Crawl Mobike data. Anti-crawler limits are not handled; an IP pool or
# re-dialling could be considered. This is only a toy.
def worker(num, location, timeout=10):
    """Scan one strip of the map and dump the API responses to two files.

    Walks a 0.002-degree grid over ``location`` and posts every grid point
    to the nearby-bikes endpoint. The raw response text is appended to
    ``worker<num>_rawdata.txt`` and the parsed records (see ``save``) to
    ``worker<num>_tabulardata.txt``, both in the current directory.

    Args:
        num: Worker index, used only to build the output file names.
        location: ``(top, left, bottom, right)``. Latitude runs from
            ``left`` down to ``right`` and longitude from ``top`` up to
            ``bottom``; the end values themselves are excluded.
        timeout: Seconds to wait for each request before giving up on it.
    """
    url = 'https://mwx.mobike.com/mobike-api/rent/nearbyBikesInfo.do'
    # Form payload; latitude/longitude are overwritten for every grid point.
    postdata = {
        'latitude': 0,
        'longitude': 0,
        'errMsg': 'getMapCenterLocation',
    }
    (top, left, bottom, right) = location
    # Fetch the data one grid cell at a time.
    lat_range = np.arange(left, right, -0.002)
    # The longitude steps are the same for every latitude row, so build once.
    lon_range = np.arange(top, bottom, 0.002)
    raw_path = 'worker' + str(num) + '_rawdata.txt'
    tabular_path = 'worker' + str(num) + '_tabulardata.txt'
    # Context managers close both files and the session even if a request or
    # a write raises. UTF-8 is set explicitly so non-ASCII response text does
    # not depend on the platform's default encoding.
    with open(raw_path, 'w', encoding='utf-8') as raw_fd, \
            open(tabular_path, 'w', encoding='utf-8') as tabular_fd, \
            requests.session() as session:
        for lat in lat_range:
            for lon in lon_range:
                postdata['latitude'] = lat
                postdata['longitude'] = lon
                try:
                    mobike_data = session.post(
                        url,
                        headers=HEADERS,
                        data=postdata,
                        verify=False,
                        timeout=timeout,
                    )
                except requests.RequestException as err:
                    # One failed grid point should not abort the whole strip.
                    print('***ERR4*** ', lat, lon, err)
                    continue
                # Keep the raw response as well; it is worth preserving.
                raw_fd.write(mobike_data.text + '\n')
                # Parse and save. De-duplication is still needed and is left
                # to a separate, later step outside the workers.
                save(mobike_data.text, tabular_fd)
# Script entry point.
if __name__ == '__main__':
    jobs = []
    location = split_pos(INIT_POS, PROCESS_NUM)
    # Run one process per strip (threads would work as well).
    for process_id, strip in enumerate(location):
        p = multiprocessing.Process(target=worker, args=(process_id, strip))
        jobs.append(p)
        p.start()
    # Wait for every worker explicitly instead of relying on interpreter exit.
    for p in jobs:
        p.join()