摩拜单车之数据分析

参考论文:Short-term FFBS demand prediction with multi-source data in a hybrid deep learning framework

数据集来自:摩拜单车2016年8月份的订单数据

[TOC]

数据包括:订单id、用户id、单车id、取车时间戳、取车经纬度、停车时间、停车经纬度、轨迹信息等。

剔除异常值

剔除骑行时间超过 10 小时或骑行距离超过 50 km 的订单,因此首先需要计算每笔订单的骑行时间和骑行距离。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Riding distance: great-circle (haversine) distance between two points.
def geodistance(lng1, lat1, lng2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lng1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lng2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.

    Returns:
        The haversine distance in kilometres, rounded to 3 decimal places,
        computed with a mean Earth radius of 6371 km.
    """
    from math import radians, cos, sin, asin, sqrt

    # Convert all four coordinates from degrees to radians.
    lng1_r, lat1_r, lng2_r, lat2_r = map(radians, [lng1, lat1, lng2, lat2])
    dlon = lng1_r - lng2_r
    dlat = lat1_r - lat2_r
    hav = sin(dlat / 2) ** 2 + cos(lat1_r) * cos(lat2_r) * sin(dlon / 2) ** 2
    # Floating-point rounding can push the haversine term a hair above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # A plain comparison (not min()) is used so NaN inputs still yield NaN.
    if hav > 1.0:
        hav = 1.0
    metres = 2 * asin(sqrt(hav)) * 6371 * 1000  # mean Earth radius: 6371 km
    return round(metres / 1000, 3)
def get_dis(item):
    """Store the start-to-end riding distance on one order record.

    Reads the ``start_location_x/y`` and ``end_location_x/y`` entries of
    ``item``, passes them to ``geodistance`` as (lng, lat) pairs, writes the
    result under ``'distance'`` and returns the same ``item``.
    """
    start_lng = item['start_location_x']
    start_lat = item['start_location_y']
    end_lng = item['end_location_x']
    end_lat = item['end_location_y']
    item['distance'] = geodistance(start_lng, start_lat, end_lng, end_lat)
    return item
# Remove abnormal orders.
def eli_abn_data():
    """Load the raw orders, derive ride duration and distance, drop outliers.

    Reads ``mobike_shanghai.csv``, adds the columns ``daytype``, ``duration``,
    ``dur_day``, ``dur_hr``, ``dur_min``, ``ttl_min`` and ``distance``, keeps
    only rows with ``ttl_min < 600`` and ``distance < 50``, and writes the
    result to ``mobike.csv``.

    Returns:
        The filtered DataFrame.
    """
    import pandas as pd
    import numpy as np

    mobike = pd.read_csv("mobike_shanghai.csv")

    # Parse the timestamp columns.
    mobike['start_time'] = pd.to_datetime(mobike['start_time'])
    mobike['end_time'] = pd.to_datetime(mobike['end_time'])
    # ISO weekday of the pick-up time: Monday == 1 ... Sunday == 7.
    mobike['daytype'] = mobike['start_time'].apply(lambda x: x.isoweekday())

    # Ride duration in whole minutes.
    mobike['duration'] = mobike['end_time'] - mobike['start_time']
    # Read day/hour/minute from the Timedelta components instead of slicing
    # str(Timedelta): the string form of a negative duration is
    # "-1 days +23:47:00", whose hour field would be parsed as "+2", and a
    # missing duration ("NaT") cannot be converted to int at all.
    parts = mobike['duration'].dt.components
    mobike['dur_day'] = parts['days']
    mobike['dur_hr'] = parts['hours']
    mobike['dur_min'] = parts['minutes']
    mobike['ttl_min'] = (
        mobike['dur_day'] * 24 * 60 + mobike['dur_hr'] * 60 + mobike['dur_min']
    )

    # Ride distance; get_dis fills the pre-created column row by row
    # (axis=1 calls the function once per row).
    mobike['distance'] = np.nan
    mobike = mobike.apply(get_dis, axis=1)

    # Keep rides shorter than 10 h (600 min) and shorter than 50 km.
    # Rows whose duration or distance is NaN fail the comparison and are
    # dropped as well.
    mobike = mobike[(mobike['ttl_min'] < 600) & (mobike['distance'] < 50)]
    mobike.to_csv('mobike.csv', index=False)
    return mobike
# Bind the cleaned DataFrame at module level: the plotting code further down
# reads a global `mobike`, which the bare call never defined.
mobike = eli_abn_data()

骑行时间分布可视化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import matplotlib.pyplot as plt
%matplotlib inline
# Tag every order with its ISO weekday (Monday == 1 ... Sunday == 7),
# a weekdays/weekends label, and the hour of day of the pick-up time.
# NOTE(review): expects a DataFrame named `mobike` with a datetime
# `start_time` column to already be in scope — confirm.
mobike['dayid'] = mobike['start_time'].apply(lambda ts: ts.isoweekday())
mobike['daytype'] = mobike['dayid'].apply(
    lambda day: 'weekends' if day in (6, 7) else 'weekdays'
)
mobike['hourid'] = mobike['start_time'].apply(lambda ts: ts.timetuple().tm_hour)
mobike.to_csv('mobike.csv', index=False)


# Ride counts per hour for ISO weekday `i`, ordered by hour.
def DayiHour(i):
    rides_that_day = mobike[mobike['dayid'] == i]
    return rides_that_day['hourid'].value_counts().sort_index()


# One line per weekday: (ISO weekday, legend label, colour, marker).
plt.figure(figsize=(10, 5))
weekday_styles = [
    (1, 'Mon', 'grey', 's'),
    (2, 'Tue', 'red', 'o'),
    (3, 'Wed', 'orange', '*'),
    (4, 'Thu', 'green', '+'),
    (5, 'Fri', 'blue', 'p'),
    (6, 'Sat', 'purple', 'x'),
    (7, 'Sun', 'hotpink', '^'),
]
for weekday, legend_label, colour, marker_shape in weekday_styles:
    plt.plot(
        DayiHour(weekday),
        label=legend_label,
        lw=1,
        c=colour,
        marker=marker_shape,
    )
plt.xticks(list(range(24)))
plt.grid(True)
plt.legend()  # show the legend
plt.savefig('Time Distribution.jpg', dpi=800)