摩拜单车之投放预测

[TOC]

划分预测区域

经过分析,发现共享单车的活动区域集中在经度121.25到121.65之间、纬度31.05到31.45之间。

因此在此范围内,将区域划分成5*5的小方格,作为各预测区域。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def divid_to_grid(lon, lat, rows=5, cols=5):
    """Map a coordinate to the 1-based label of the grid cell containing it.

    The bounding box (longitude 121.25-121.65, latitude 31.05-31.45) is split
    into ``rows`` x ``cols`` equal cells.  Labels run along each row in order
    of increasing longitude, starting at the lowest-longitude /
    highest-latitude corner, so the first cell is 1 and the last one is
    ``rows * cols``.

    A point lying exactly on an interior cell border belongs to the cell with
    the higher longitude / lower latitude.  A point on the outer border of the
    box belongs to the outermost cell on that side.

    Args:
        lon: Longitude of the point.
        lat: Latitude of the point.
        rows: Number of latitude bands in the grid.
        cols: Number of longitude bands in the grid.

    Returns:
        The cell label in ``1..rows*cols``, or ``None`` when the point lies
        outside the bounding box or either coordinate is NaN.
    """
    from decimal import Decimal

    # Bounds are built from strings so they are exact decimal values;
    # Decimal.from_float(float) would not solve the precision problem.
    low_lon = Decimal('121.25')
    high_lon = Decimal('121.65')
    low_lat = Decimal('31.05')
    high_lat = Decimal('31.45')
    lon_step = (high_lon - low_lon) / cols
    lat_step = (high_lat - low_lat) / rows

    # NaN is the only value that differs from itself; it belongs to no cell.
    if lon != lon or lat != lat:
        return None
    # The Decimal operand is kept on the left so Decimal's own comparison
    # logic is used for every kind of numeric input.
    inside = (low_lon <= lon and high_lon >= lon
              and low_lat <= lat and high_lat >= lat)
    if not inside:
        return None

    # Walk east from the low-longitude edge.  Stopping at `cols` keeps a point
    # sitting exactly on the high-longitude edge inside the last column.
    col = 1
    while col < cols and low_lon + lon_step * col <= lon:
        col += 1

    # Walk south from the high-latitude edge.  Stopping at `rows` keeps a
    # point sitting exactly on the low-latitude edge inside the last row.
    row = 1
    while row < rows and high_lat - lat_step * row >= lat:
        row += 1

    return (row - 1) * cols + col

# Sample coordinates and the labels noted for them:
#   divid_to_grid(121.525, 31.309) -> 9
#   divid_to_grid(121.386, 31.206) -> 17
#   divid_to_grid(121.427, 31.260) -> 13
#   divid_to_grid(121.242, 31.199) -> None


def _start_area(trip):
    """Return the grid label for one row, from its start_location_x / start_location_y."""
    return divid_to_grid(trip['start_location_x'], trip['start_location_y'])


# Row-wise apply: every row of ds gets the label of its start location.
ds['area'] = ds.apply(_start_area, axis=1)

各个区域在各时段的需求统计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def timeSplit_getFlow(mobike, interval=30):
    """Count trips per grid area for every fixed-length slot of August 2016.

    The month from 2016-08-01 00:00:00 up to (but excluding)
    2016-09-01 00:00:00 is cut into consecutive slots of ``interval`` minutes.
    A trip is counted in the slot whose half-open range ``[start, end)``
    contains its ``start_time``.

    Args:
        mobike: DataFrame with a ``start_time`` column (any values accepted by
            ``pd.to_datetime``) and an ``area`` column of grid labels.  The
            ``start_time`` column is converted to datetimes in place.
        interval: Slot length in minutes.

    Returns:
        A DataFrame with an ``interval`` column of ``"start~end"`` strings
        followed by one count column for each area label 1..25.  The same
        table is also written to ``mobike_tongji.csv``.
    """
    mobike['start_time'] = pd.to_datetime(mobike['start_time'])

    # The frequency is given in minutes directly: converting to fractional
    # hours is inexact for slot lengths such as 20 minutes, and "min" is
    # accepted by both old and current pandas releases.
    edges = pd.date_range(start="2016-08-01 00:00", end="2016-09-01 00:00",
                          freq=f"{interval}min")
    n_slots = len(edges) - 1

    # Human-readable description of each slot, then one zeroed counter list
    # per area so slots without trips still appear in the output.
    table = {
        'interval': [str(begin) + '~' + str(end)
                     for begin, end in zip(edges[:-1], edges[1:])],
    }
    area_labels = list(range(1, 26))
    for area in area_labels:
        table[area] = [0] * n_slots

    # Slots are evenly spaced from edges[0], so the slot number of a trip is
    # the elapsed time floor-divided by the slot length.  Trips before the
    # first edge get negative numbers and missing times become NaN; both are
    # filtered out below.
    slot_no = (mobike['start_time'] - edges[0]) // pd.Timedelta(minutes=interval)
    trips = pd.DataFrame({'area': mobike['area'], 'slot': slot_no}).dropna()
    keep = trips['area'].isin(area_labels) & trips['slot'].between(0, n_slots - 1)
    trips = trips[keep]

    # One pass over the data instead of one boolean mask per (area, slot).
    sizes = trips.groupby(['area', 'slot']).size()
    for (area, slot), count in sizes.items():
        table[int(area)][int(slot)] = int(count)

    # Convert to a DataFrame and persist it.
    df_mobike = pd.DataFrame(table)
    df_mobike.to_csv('mobike_tongji.csv', index=False)
    return df_mobike
# Build the per-area, per-slot count table for ds with the default 30-minute
# interval. The returned DataFrame is discarded here; the function also writes
# the table to mobike_tongji.csv.
timeSplit_getFlow(ds)

搭建模型

模型图如下

1
2
3
4
5
6
7
8
9
10
from keras import models
from keras import layers
# Layer stack: two ConvLSTM2D layers, then a flattened dense head with 25
# outputs.
# NOTE(review): input_shape=(2, 5, 5, 1) presumably means 2 time steps of a
# 5x5 single-channel grid, and the 25 outputs one value per grid cell - confirm
# against the data preparation code.
network = models.Sequential([
    # The first recurrent layer returns the whole sequence for the next one.
    layers.ConvLSTM2D(
        filters=16,
        kernel_size=(2, 2),
        padding='same',
        input_shape=(2, 5, 5, 1),
        return_sequences=True,
    ),
    # The second recurrent layer returns only its final output.
    layers.ConvLSTM2D(
        filters=16,
        kernel_size=(2, 2),
        padding='same',
        return_sequences=False,
    ),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(25, activation='sigmoid'),
])
network.compile(loss='mse', optimizer='adam')
network.summary()

汇总25个区域的平均误差:

1
2
3
import numpy as np
# Squared error of every prediction, averaged along axis 0 and then summed
# into a single total.
# NOTE(review): axis 0 is presumably the sample axis, so the mean yields one
# value per area - confirm the shapes of test_y / pred_y.
squared_error = (test_y - pred_y) ** 2
per_area_mse = np.mean(squared_error, axis=0)
mse = np.sum(per_area_mse)
print('mse:', mse)  # output noted in the original: mse: 222.82507610476117