Task Reference Answers


## 任务1

  1. How many taxis 🚖 appear in the cruising-taxi GPS data for 20190603?
import pandas as pd
df = pd.read_csv('../input/taxiGps20190603.csv')
df['CARNO'].nunique()
# 6727
  2. How many ride-hailing cars 🚗 appear in the ride-hailing GPS data for 20190603?
import pandas as pd
df = pd.read_csv('../input/wycGps20190603.csv')
df['CAR_NO'].nunique()
# 28774
  3. What are the maximum and minimum pickup longitude and latitude in the cruising-taxi order data for 20190603?
import pandas as pd
df = pd.read_csv('../input/taxiOrder20190603.csv')
df['GETON_LATITUDE'].max(), df['GETON_LONGITUDE'].max()
# (33.497205, 129.899483)
df['GETON_LATITUDE'].min(), df['GETON_LONGITUDE'].min()
  4. What is the most common drop-off location in the ride-hailing order data for 20190603?
    • Assume a location is identified by its longitude and latitude, each kept to three decimal places.
    • Hint: the longitude and latitude can be concatenated into a single string for counting (an alternative groupby-based sketch follows the output below).
import pandas as pd
import numpy as np
df = pd.read_csv('../input/wycOrder20190603.csv')
df = df[['DEST_LONGITUDE', 'DEST_LATITUDE']].dropna()

df['DEST_LONGITUDE'] = df['DEST_LONGITUDE'].astype(float)
df['DEST_LATITUDE'] = df['DEST_LATITUDE'].astype(float)

df['DEST_LONGITUDE'] = df['DEST_LONGITUDE'].apply(lambda x: '{0:.3f}'.format(x))
df['DEST_LATITUDE'] = df['DEST_LATITUDE'].apply(lambda x: '{0:.3f}'.format(x))

df['DEST_GPS'] = df['DEST_LONGITUDE'] + '_' + df['DEST_LATITUDE']
df['DEST_GPS'].value_counts().head(1)

Output:

118.075_24.635    4879
Name: DEST_GPS, dtype: int64
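
An alternative sketch for the same question that rounds the coordinates numerically and counts (longitude, latitude) pairs with a groupby instead of concatenating strings; the same DEST_* columns are assumed, and results near rounding boundaries may differ very slightly from the string-formatting approach:

import pandas as pd

df = pd.read_csv('../input/wycOrder20190603.csv')
df = df[['DEST_LONGITUDE', 'DEST_LATITUDE']].dropna().astype(float)

# Round each coordinate to three decimal places and count each (lon, lat) pair
rounded = df.round(3)
rounded.groupby(['DEST_LONGITUDE', 'DEST_LATITUDE']).size().sort_values(ascending=False).head(1)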

## 任务2

  1. Visualize how the average speed of cruising taxis at 9:00 a.m. changed over 20190531 - 20190609.
import pandas as pd
import numpy as np
import glob, gc

paths = glob.glob('../input/taxiGps20190*.csv')
paths.sort()

speed = []
for path in paths:
    df = pd.read_csv(path)
    df['GPS_TIME'] = pd.to_datetime(df['GPS_TIME'])
    df['GPS_TIME_hour'] = df['GPS_TIME'].dt.hour
    # Mean GPS_SPEED during the 9 o'clock hour of this day
    hour9_speed = df.groupby(['GPS_TIME_hour'])['GPS_SPEED'].mean().loc[9]
    print(path, hour9_speed)
    speed.append(hour9_speed)
    del df; gc.collect()

Output:

../input/taxiGps20190531.csv 19.391072205164892
../input/taxiGps20190601.csv 18.878925202464238
../input/taxiGps20190602.csv 17.844745282889757
../input/taxiGps20190603.csv 19.200816309072298
../input/taxiGps20190604.csv 18.858726970293013
../input/taxiGps20190605.csv 9.153008752735236
../input/taxiGps20190606.csv 19.24489719731869
../input/taxiGps20190607.csv 18.852925390406494
../input/taxiGps20190608.csv 18.026592133750096
../input/taxiGps20190609.csv 18.275623368334777

Visualization:

%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(range(len(paths)), speed)
_ = plt.xticks(range(len(paths)), [x.split('/')[-1][7:-4] for x in paths], rotation=30)
plt.ylabel('Speed')
plt.savefig('speed.jpg', dpi=200, bbox_inches='tight')
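
A minimal sketch of the same per-day 9:00 average that reads only the two columns it needs and parses timestamps at load time, which keeps memory lower than loading the full files (column names as in the loop above):

import glob
import pandas as pd

paths = sorted(glob.glob('../input/taxiGps20190*.csv'))
speed = []
for path in paths:
    df = pd.read_csv(path, usecols=['GPS_TIME', 'GPS_SPEED'], parse_dates=['GPS_TIME'])
    # Keep only records from the 9 o'clock hour, then average the speed
    speed.append(df.loc[df['GPS_TIME'].dt.hour == 9, 'GPS_SPEED'].mean())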

  2. Visualize how the average operating time per cruising taxi (the number of active minutes within a day) changed over 20190531 - 20190609.
import pandas as pd
import numpy as np
import glob, gc

paths = glob.glob('../input/taxiGps20190*.csv')
paths.sort()

minute = []
for path in paths:
    df = pd.read_csv(path, usecols=['CARNO', 'GPS_TIME'])
    # Drop the seconds ("HH:MM:SS" -> "HH:MM") so each record maps to one minute
    df['GPS_TIME'] = df['GPS_TIME'].apply(lambda x: x[:-3])
    # Average number of distinct active minutes per taxi on this day
    minute.append(df.groupby(['CARNO'])['GPS_TIME'].nunique().mean())

    print(path, minute[-1])
    del df; gc.collect()

Output:

../input/taxiGps20190531.csv 973.0809692671395
../input/taxiGps20190601.csv 1038.752221563981
../input/taxiGps20190602.csv 554.1681310498883
../input/taxiGps20190603.csv 888.6511074773301
../input/taxiGps20190604.csv 1140.283096926714
../input/taxiGps20190605.csv 876.1907719609583
../input/taxiGps20190606.csv 948.6421814957139
../input/taxiGps20190607.csv 1374.2632749925838
../input/taxiGps20190608.csv 1371.572085434589
../input/taxiGps20190609.csv 1377.2666368914695

Visualization:

%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(range(len(paths)), minute)
_ = plt.xticks(range(len(paths)), [x.split('/')[-1][7:-4] for x in paths], rotation=30)
plt.ylabel('Minute')
plt.savefig('minute.jpg', dpi=200, bbox_inches='tight')
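
An alternative sketch for counting active minutes that floors the parsed timestamps to the minute instead of slicing strings; it assumes GPS_TIME parses cleanly with pd.to_datetime and is shown for a single day only:

import pandas as pd

df = pd.read_csv('../input/taxiGps20190603.csv', usecols=['CARNO', 'GPS_TIME'])
df['GPS_TIME'] = pd.to_datetime(df['GPS_TIME']).dt.floor('min')
# Average number of distinct active minutes per taxi on this day
df.groupby('CARNO')['GPS_TIME'].nunique().mean()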


## 任务3

  1. For 20190531, compute the average waiting time of taxi pickups within LONGITUDE 118.155060±0.01 and LATITUDE 24.506035±0.01.
import pandas as pd
import numpy as np
import glob

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

taxiorder2019 = pd.read_csv(INPUT_PATH + 'taxiOrder20190531.csv', nrows=MAX_ROWS)
taxiorder2019.loc[
    (np.abs(taxiorder2019['GETON_LONGITUDE'] - 118.155060) < 0.01)
    &(np.abs(taxiorder2019['GETON_LATITUDE'] - 24.506035) < 0.01)
]['WAITING_TIME'].mean()
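
The same box query can be wrapped in a small helper so other centre points are easy to try; mean_waiting_time_in_box is a hypothetical name, and the column names and the 0.01-degree half-width follow the snippet above:

import numpy as np
import pandas as pd

def mean_waiting_time_in_box(df, lon, lat, eps=0.01):
    # Mean WAITING_TIME for pickups within +/- eps degrees of (lon, lat)
    mask = (np.abs(df['GETON_LONGITUDE'] - lon) < eps) & \
           (np.abs(df['GETON_LATITUDE'] - lat) < eps)
    return df.loc[mask, 'WAITING_TIME'].mean()

taxiorder2019 = pd.read_csv('../input/taxiOrder20190531.csv')
mean_waiting_time_in_box(taxiorder2019, 118.155060, 24.506035)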
  2. For 20190531 - 20190609, find the pickup location with the longest average waiting time among locations that appear more than 5 times.
import pandas as pd
import numpy as np
import glob

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

taxiorder2019 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS) 
    for x in glob.glob(INPUT_PATH + 'taxiOrder2019*.csv')
])

taxiorder2019['GETON_LONGITUDE'] = taxiorder2019['GETON_LONGITUDE'].apply(lambda x: '{0:.3f}'.format(x))
taxiorder2019['GETON_LATITUDE'] = taxiorder2019['GETON_LATITUDE'].apply(lambda x: '{0:.3f}'.format(x))
taxiorder2019['GETON_GPS'] = taxiorder2019['GETON_LONGITUDE'] + '_' + taxiorder2019['GETON_LATITUDE']

# Keep only pickup locations that appear more than 5 times
gps_counts = taxiorder2019['GETON_GPS'].value_counts()
legal_gps = gps_counts[gps_counts > 5].index
legal_gps_waittime = taxiorder2019[taxiorder2019['GETON_GPS'].isin(legal_gps)].groupby(['GETON_GPS'])['WAITING_TIME'].mean()
legal_gps_waittime.sort_values(ascending=False).head(1)
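
An equivalent sketch that computes the count and the mean in a single groupby and then filters on the count; it assumes the same order files and a GETON_GPS key built as above:

import glob
import pandas as pd

taxiorder2019 = pd.concat([pd.read_csv(x) for x in glob.glob('../input/taxiOrder2019*.csv')])
taxiorder2019['GETON_GPS'] = (taxiorder2019['GETON_LONGITUDE'].map('{:.3f}'.format)
                              + '_' + taxiorder2019['GETON_LATITUDE'].map('{:.3f}'.format))

# Mean waiting time and order count per pickup cell, then keep cells seen more than 5 times
stats = taxiorder2019.groupby('GETON_GPS')['WAITING_TIME'].agg(['mean', 'count'])
stats[stats['count'] > 5].sort_values('mean', ascending=False).head(1)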
  3. Compare the average waiting time of taxi orders over the Dragon Boat Festival holidays of 2019 and 2020: did it rise, fall, or stay the same?
import pandas as pd
import numpy as np
import glob

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

taxiorder2019 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS) 
    for x in ['../input/taxiOrder20190607.csv',
             '../input/taxiOrder20190608.csv',
             '../input/taxiOrder20190609.csv']
])
print(taxiorder2019['WAITING_TIME'].mean())
# 243.93177472175148

taxiorder2020 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS) 
    for x in ['../input/taxiOrder20200625.csv',
             '../input/taxiOrder20200626.csv',
             '../input/taxiOrder20200627.csv']
])
print(taxiorder2020['WAITING_TIME'].mean())
# 215.82923713770793

The average waiting time decreased, from about 244 seconds over the 2019 holiday to about 216 seconds over the 2020 holiday.

## 任务4

  1. Compare the day before the Dragon Boat Festival holiday in 2019 and in 2020 (which day is that in each year? 🤔): did the daily average speed of cruising taxis rise or fall?
import pandas as pd
import numpy as np
import glob

INPUT_PATH = '../input/'  # data directory

taxigps2019 = pd.read_csv(INPUT_PATH + 'taxiGps20190606.csv', nrows=None)
print(taxigps2019['GPS_SPEED'].mean())
# 18.856870020906733

taxigps2020 = pd.read_csv(INPUT_PATH + 'taxiGps20200624.csv', nrows=None)
print(taxigps2020['GPS_SPEED'].mean())
# 16.422890546040616

The daily average speed fell, from about 18.86 km/h on 2019-06-06 to about 16.42 km/h on 2020-06-24.

  2. Compare the daily average speed of cruising taxis before, during, and after the 2019 Dragon Boat Festival holiday: what is the trend?

Before the 2019 Dragon Boat Festival holiday, the daily average speed of cruising taxis was 18.12 km/h, with an overall upward trend. During the holiday the daily average was 18.04 km/h, with an overall downward trend: speed peaked on June 7 at 18.8 km/h and was lowest on June 8 at only 17.6 km/h.
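
A minimal sketch of how the daily averages behind this summary can be reproduced, looping over the 2019 GPS files as in Task 2 (GPS_SPEED is assumed to be in km/h, as above):

import glob
import pandas as pd

daily_speed = {}
for path in sorted(glob.glob('../input/taxiGps20190*.csv')):
    df = pd.read_csv(path, usecols=['GPS_SPEED'])
    # One daily mean speed per file, keyed by the date in the file name
    daily_speed[path.split('/')[-1][7:-4]] = df['GPS_SPEED'].mean()

print(daily_speed)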


## 任务5

  1. Use geohash to find the Top 3 most popular ride-hailing drop-off locations during the 2019 Dragon Boat Festival holiday.
import pandas as pd
import numpy as np
import glob
import geohash

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

wycorder2019 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS) 
    for x in ['../input/wycOrder20190607.csv',
             '../input/wycOrder20190608.csv',
             '../input/wycOrder20190609.csv']
])
# Geohash cell (precision 8) of each drop-off point
wycorder2019['geohash'] = wycorder2019.apply(
    lambda x: geohash.encode(x['DEST_LATITUDE'], x['DEST_LONGITUDE'], precision=8),
    axis=1)
wycorder2019['geohash'].value_counts().head(3)
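
Each result is a geohash cell rather than a point; at precision=8 a cell is roughly 38 m × 19 m near the equator. To interpret a top cell it can be decoded back to its centre coordinates, assuming the python-geohash package used above, which also provides decode() (the hash string below is a made-up example, not a real result):

import geohash

lat, lon = geohash.decode('ws7gps00')  # made-up example hash
print(lat, lon)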
  2. Use geohash to find the Top 3 most popular ride-hailing pickup locations during the 2019 Dragon Boat Festival holiday.
import pandas as pd
import numpy as np
import glob
import geohash

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

wycorder2019 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS) 
    for x in ['../input/wycOrder20190607.csv',
             '../input/wycOrder20190608.csv',
             '../input/wycOrder20190609.csv']
])
# Geohash cell (precision 8) of each pickup point
wycorder2019['geohash'] = wycorder2019.apply(
    lambda x: geohash.encode(x['DEP_LATITUDE'], x['DEP_LONGITUDE'], precision=8),
    axis=1)
wycorder2019['geohash'].value_counts().head(3)
  3. Use geohash to find, for the 2019 and 2020 Dragon Boat Festival holidays separately, the pickup location (appearing more than 100 times) with the longest average gap between the booked pickup time and the actual pickup time (Top 1 for each year).
import pandas as pd
import numpy as np
import glob
import geohash

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

wycorder2019 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS) 
    for x in ['../input/wycOrder20190607.csv',
             '../input/wycOrder20190608.csv',
             '../input/wycOrder20190609.csv']
])

# Geohash cell (precision 8) of each pickup point
wycorder2019['geohash'] = wycorder2019.apply(
    lambda x: geohash.encode(x['DEP_LATITUDE'], x['DEP_LONGITUDE'], precision=8),
    axis=1)

wycorder2019['DEP_TIME'] = pd.to_datetime(wycorder2019['DEP_TIME'], format='%Y%m%d%H%M%S')
wycorder2019['BOOK_DEP_TIME'] = pd.to_datetime(wycorder2019['BOOK_DEP_TIME'], format='%Y%m%d%H%M%S')
# Gap between the booked pickup time and the actual pickup time, in seconds
wycorder2019['WAITING_TIME'] = (wycorder2019['DEP_TIME'] - wycorder2019['BOOK_DEP_TIME']).dt.total_seconds()

# Keep only pickup cells that appear more than 100 times
geohash_counts = wycorder2019['geohash'].value_counts()
wycorder2019[wycorder2019['geohash'].isin(
    geohash_counts[geohash_counts > 100].index)
].groupby('geohash')['WAITING_TIME'].mean().sort_values().tail(1)
The same computation for the 2020 Dragon Boat Festival holiday:

import pandas as pd
import numpy as np
import glob
import geohash

INPUT_PATH = '../input/'  # data directory
MAX_ROWS = None  # maximum number of rows to read (None = all)

wycorder2020 = pd.concat([
    pd.read_csv(x, nrows=MAX_ROWS, sep='\\')  # the 2020 order files use a different separator
    for x in ['../input/wycOrder20200625.csv',
             '../input/wycOrder20200626.csv',
             '../input/wycOrder20200627.csv']
])

# Geohash cell (precision 8) of each pickup point
wycorder2020['geohash'] = wycorder2020.apply(
    lambda x: geohash.encode(x['DEP_LATITUDE'], x['DEP_LONGITUDE'], precision=8),
    axis=1)

wycorder2020['DEP_TIME'] = pd.to_datetime(wycorder2020['DEP_TIME'], format='%Y%m%d%H%M%S')
wycorder2020['BOOK_DEP_TIME'] = pd.to_datetime(wycorder2020['BOOK_DEP_TIME'], format='%Y%m%d%H%M%S')
# Gap between the booked pickup time and the actual pickup time, in seconds
wycorder2020['WAITING_TIME'] = (wycorder2020['DEP_TIME'] - wycorder2020['BOOK_DEP_TIME']).dt.total_seconds()

# Keep only pickup cells that appear more than 100 times
geohash_counts = wycorder2020['geohash'].value_counts()
wycorder2020[wycorder2020['geohash'].isin(
    geohash_counts[geohash_counts > 100].index)
].groupby('geohash')['WAITING_TIME'].mean().sort_values().tail(1)
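
Since the 2019 and 2020 computations are identical apart from the file list and the separator, they can be folded into one helper; slowest_popular_pickup is a hypothetical name, and the column names, precision, and the sep='\\' quirk of the 2020 files follow the snippets above:

import pandas as pd
import geohash

def slowest_popular_pickup(paths, min_count=100, **read_kwargs):
    df = pd.concat([pd.read_csv(p, **read_kwargs) for p in paths])
    # Geohash cell (precision 8) of each pickup point
    df['geohash'] = df.apply(
        lambda r: geohash.encode(r['DEP_LATITUDE'], r['DEP_LONGITUDE'], precision=8),
        axis=1)
    df['DEP_TIME'] = pd.to_datetime(df['DEP_TIME'], format='%Y%m%d%H%M%S')
    df['BOOK_DEP_TIME'] = pd.to_datetime(df['BOOK_DEP_TIME'], format='%Y%m%d%H%M%S')
    df['WAITING_TIME'] = (df['DEP_TIME'] - df['BOOK_DEP_TIME']).dt.total_seconds()
    # Keep only pickup cells that appear more than min_count times
    counts = df['geohash'].value_counts()
    popular = counts[counts > min_count].index
    return (df[df['geohash'].isin(popular)]
            .groupby('geohash')['WAITING_TIME'].mean()
            .sort_values().tail(1))

print(slowest_popular_pickup(['../input/wycOrder2019060%d.csv' % d for d in (7, 8, 9)]))
print(slowest_popular_pickup(['../input/wycOrder2020062%d.csv' % d for d in (5, 6, 7)],
                             sep='\\'))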

