20190603
中包含多少辆出租车🚖?import pandas as pd
df = pd.read_csv('../input/taxiGps20190603.csv')
df['CARNO'].nunique()
# 6727
20190603
中包含多少辆网约车🚗?import pandas as pd
df = pd.read_csv('../input/wycGps20190603.csv')
df['CAR_NO'].nunique()
# 28774
20190603
中上车经纬度的最大最小值?import pandas as pd
df = pd.read_csv('../input/taxiOrder20190603.csv')
df['GETON_LATITUDE'].max(), df['GETON_LONGITUDE'].max(),
# (33.497205, 129.899483)
20190603
中下车经纬度最常见的位置?import pandas as pd
import numpy as np
df = pd.read_csv('../input/wycOrder20190603.csv')
df = df[['DEST_LONGITUDE', 'DEST_LATITUDE']].dropna()
df['DEST_LONGITUDE'] = df['DEST_LONGITUDE'].astype(float)
df['DEST_LATITUDE'] = df['DEST_LATITUDE'].astype(float)
df['DEST_LONGITUDE'] = df['DEST_LONGITUDE'].apply(lambda x: '{0:.3f}'.format(x))
df['DEST_LATITUDE'] = df['DEST_LATITUDE'].apply(lambda x: '{0:.3f}'.format(x))
df['DEST_GPS'] = df['DEST_LONGITUDE'] + '_' + df['DEST_LATITUDE']
df['DEST_GPS'].value_counts().head(1)
输出
118.075_24.635 4879
Name: DEST_GPS, dtype: int64
20190531 - 20190609
期间早上9点的平均速度变化;import pandas as pd
import numpy as np
import glob, gc
paths = glob.glob('../input/taxiGps20190*.csv')
paths.sort()
speed = []
for path in paths:
df = pd.read_csv(path)
df['GPS_TIME'] = pd.to_datetime(df['GPS_TIME'])
df['GPS_TIME_hour'] = df['GPS_TIME'].dt.hour
print(path, df.groupby(['GPS_TIME_hour'])['GPS_SPEED'].mean().loc[9])
speed.append(df.groupby(['GPS_TIME_hour'])['GPS_SPEED'].mean().loc[9])
del df; gc.collect()
输出:
../input/taxiGps20190531.csv 19.391072205164892
../input/taxiGps20190601.csv 18.878925202464238
../input/taxiGps20190602.csv 17.844745282889757
../input/taxiGps20190603.csv 19.200816309072298
../input/taxiGps20190604.csv 18.858726970293013
../input/taxiGps20190605.csv 9.153008752735236
../input/taxiGps20190606.csv 19.24489719731869
../input/taxiGps20190607.csv 18.852925390406494
../input/taxiGps20190608.csv 18.026592133750096
../input/taxiGps20190609.csv 18.275623368334777
可视化:
%pylab inline
plt.plot(range(len(paths)), speed)
_ = plt.xticks(range(len(paths)), [x.split('/')[-1][7:-4] for x in paths], rotation=30)
plt.ylabel('Speed')
plt.savefig('speed.jpg', dpi=200, bbox_inches='tight')
20190531 - 20190609
期间平均每辆巡游车运营时间(一天之内活跃的分钟个数)变化;import pandas as pd
import numpy as np
import glob, gc
paths = glob.glob('../input/taxiGps20190*.csv')
paths.sort()
minute = []
for path in paths:
df = pd.read_csv(path, usecols=['CARNO', 'GPS_TIME'])
df['GPS_TIME'] = df['GPS_TIME'].apply(lambda x: x[:-3])
minute.append(df.groupby(['CARNO'])['GPS_TIME'].nunique().mean())
print(path, minute[-1])
del df; gc.collect()
输出:
../input/taxiGps20190531.csv 973.0809692671395
../input/taxiGps20190601.csv 1038.752221563981
../input/taxiGps20190602.csv 554.1681310498883
../input/taxiGps20190603.csv 888.6511074773301
../input/taxiGps20190604.csv 1140.283096926714
../input/taxiGps20190605.csv 876.1907719609583
../input/taxiGps20190606.csv 948.6421814957139
../input/taxiGps20190607.csv 1374.2632749925838
../input/taxiGps20190608.csv 1371.572085434589
../input/taxiGps20190609.csv 1377.2666368914695
可视化:
%pylab inline
plt.plot(range(len(paths)), minute)
_ = plt.xticks(range(len(paths)), [x.split('/')[-1][7:-4] for x in paths], rotation=30)
plt.ylabel('Minute')
plt.savefig('minute.jpg', dpi=200, bbox_inches='tight')
20190531
出租车在LONGITUDE(118.155060±0.01)、LATITUDE(24.506035±0.01)方位内打车的平均等待时间。import pandas as pd
import numpy as np
import glob
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
taxiorder2019 = pd.read_csv(INPUT_PATH + 'taxiOrder20190531.csv', nrows=None)
taxiorder2019.loc[
(np.abs(taxiorder2019['GETON_LONGITUDE'] - 118.155060) < 0.01)
&(np.abs(taxiorder2019['GETON_LATITUDE'] - 24.506035) < 0.01)
]['WAITING_TIME'].mean()
20190531
- 20190609
期间出租订单经纬度上平均等待时间长的位置(且位置出现评率大于5)。import pandas as pd
import numpy as np
import glob
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
taxiorder2019 = pd.concat([
pd.read_csv(x, nrows=None)
for x in glob.glob('../input/taxiOrder2019*.csv')
])
taxiorder2019['GETON_LONGITUDE'] = taxiorder2019['GETON_LONGITUDE'].apply(lambda x: '{0:.3f}'.format(x))
taxiorder2019['GETON_LATITUDE'] = taxiorder2019['GETON_LATITUDE'].apply(lambda x: '{0:.3f}'.format(x))
taxiorder2019['DEST_GPS'] = taxiorder2019['GETON_LONGITUDE'] + '_' + taxiorder2019['GETON_LATITUDE']
legal_gps = (taxiorder2019['DEST_GPS'].value_counts() > 5).index
legal_gps_watittime = taxiorder2019[taxiorder2019['DEST_GPS'].isin(legal_gps)].groupby(['DEST_GPS'])['WAITING_TIME'].mean()
legal_gps_watittime.sort_values(ascending=False).iloc[0]
import pandas as pd
import numpy as np
import glob
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
taxiorder2019 = pd.concat([
pd.read_csv(x, nrows=None)
for x in ['../input/taxiOrder20190607.csv',
'../input/taxiOrder20190608.csv',
'../input/taxiOrder20190609.csv']
])
print(taxiorder2019['WAITING_TIME'].mean())
# 243.93177472175148
taxiorder2020 = pd.concat([
pd.read_csv(x, nrows=None)
for x in ['../input/taxiOrder20200625.csv',
'../input/taxiOrder20200626.csv',
'../input/taxiOrder20200627.csv']
])
print(taxiorder2020['WAITING_TIME'].mean())
# 215.82923713770793
import pandas as pd
import numpy as np
import glob
INPUT_PATH = '../input/' #文件目录
taxigps2019 = pd.read_csv(INPUT_PATH + 'taxiGps20190606.csv', nrows=None)
print(taxigps2019['GPS_SPEED'].mean())
# 18.856870020906733
taxigps2019 = pd.read_csv(INPUT_PATH + 'taxiGps20200624.csv', nrows=None)
print(taxigps2019['GPS_SPEED'].mean())
# 16.422890546040616
2019端午假期前,巡游车的日平均速度为18.12km/h,平均速度的整体趋势向上,在2019年端午假期中,巡游车的平均速度为18.04km/h,整体趋势向下,在6月7日这天速度最快,达到了18.8km/h,在6月8日这天速度最慢,只有17.6km/h。
geohash
发掘2019年端午假期,网约车热门的下车地点(Top3);import pandas as pd
import numpy as np
import glob
import geohash
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
wycorder2019 = pd.concat([
pd.read_csv(x, nrows=None)
for x in ['../input/wycOrder20190607.csv',
'../input/wycOrder20190608.csv',
'../input/wycOrder20190609.csv']
])
wycorder2019['geohash'] = wycorder2019.apply(lambda x:
geohash.encode(x['DEST_LATITUDE'], x['DEST_LONGITUDE'], precision=8),
axis=1)
wycorder2019['geohash'].value_counts().head(3)
geohash
发掘2019年端午假期,网约车热门的打车(上车)地点(Top3);import pandas as pd
import numpy as np
import glob
import geohash
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
wycorder2019 = pd.concat([
pd.read_csv(x, nrows=None)
for x in ['../input/wycOrder20190607.csv',
'../input/wycOrder20190608.csv',
'../input/wycOrder20190609.csv']
])
wycorder2019['geohash'] = wycorder2019.apply(lambda x:
geohash.encode(x['DEP_LATITUDE'], x['DEP_LONGITUDE'], precision=8),
axis=1)
wycorder2019['geohash'].value_counts().head(3)
geohash
分别发掘2019和2020年端午假期,网约车打车预计上车时间
到上车时间
时间最长且上车地点出现次数大于100的地点(两年各自的Top1)。import pandas as pd
import numpy as np
import glob
import geohash
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
wycorder2019 = pd.concat([
pd.read_csv(x, nrows=None)
for x in ['../input/wycOrder20190607.csv',
'../input/wycOrder20190608.csv',
'../input/wycOrder20190609.csv']
])
wycorder2019['geohash'] = wycorder2019.apply(lambda x:
geohash.encode(x['DEP_LATITUDE'], x['DEP_LONGITUDE'], precision=8),
axis=1)
wycorder2019['DEP_TIME'] = pd.to_datetime(wycorder2019['DEP_TIME'], format='%Y%m%d%H%M%S')
wycorder2019['BOOK_DEP_TIME'] = pd.to_datetime(wycorder2019['BOOK_DEP_TIME'], format='%Y%m%d%H%M%S')
wycorder2019['WAITING_TIME'] = (wycorder2019['DEP_TIME'] - wycorder2019['BOOK_DEP_TIME']).dt.seconds
wycorder2019[wycorder2019['geohash'].isin(
(wycorder2019['geohash'].value_counts() > 100).index)
].groupby('geohash')['WAITING_TIME'].mean().sort_values().tail(1)
import pandas as pd
import numpy as np
import glob
import geohash
INPUT_PATH = '../input/' #文件目录
MAX_ROWS = None # 文件读取行数
wycorder2020 = pd.concat([
pd.read_csv(x, nrows=None, sep='\\')
for x in ['../input/wycOrder20200625.csv',
'../input/wycOrder20200626.csv',
'../input/wycOrder20200627.csv']
])
wycorder2020['geohash'] = wycorder2020.apply(lambda x:
geohash.encode(x['DEP_LATITUDE'], x['DEP_LONGITUDE'], precision=8),
axis=1)
wycorder2020['DEP_TIME'] = pd.to_datetime(wycorder2020['DEP_TIME'], format='%Y%m%d%H%M%S')
wycorder2020['BOOK_DEP_TIME'] = pd.to_datetime(wycorder2020['BOOK_DEP_TIME'], format='%Y%m%d%H%M%S')
wycorder2020['WAITING_TIME'] = (wycorder2020['DEP_TIME'] - wycorder2020['BOOK_DEP_TIME']).dt.seconds
wycorder2020[wycorder2020['geohash'].isin(
(wycorder2020['geohash'].value_counts() > 100).index)
].groupby('geohash')['WAITING_TIME'].mean().sort_values().tail(1)
© 2019-2023 coggle.club 版权所有 京ICP备20022947 京公网安备 11030102010643号