任务4:订单数据统计


## 学习目标

  • 网约车vs出租车订单价格对比
  • 网约车vs出租车订单距离对比
  • 网约车vs出租车时间对比

具体来说,希望回答以下问题:

  • 什么情况下网约车订单比出租车订单多?
  • 在同等条件下,网约车与出租车的价格相比如何?

## 订单数据统计

在订单数据中,我们希望完成以下统计:

  • 巡游车订单距离与时间的对比(2019年与2020年);
  • 巡游车空驶率对比(2019年与2020年);
  • 网约车订单距离与时间的对比(2019年与2020年);
  • 网约车空驶率对比(2019年与2020年);
  1. 巡游车订单距离与时间的对比(2019年与2020年)
import glob

import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import animation

def barlist(n, csv_paths=None):
    """Return the mean PASS_MILE for each hour of day of one taxi order file.

    Args:
        n: Index of the file to load.
        csv_paths: Optional sequence of CSV paths to index into. When omitted,
            the module-level ``paths`` list is used.

    Returns:
        A NumPy array with exactly 24 values. Element ``h`` is the mean of
        ``PASS_MILE`` over the rows whose ``GETON_DATE`` falls in hour ``h``;
        hours without any row are reported as 0.
    """
    source = paths if csv_paths is None else csv_paths

    # Only two columns are needed, so skip parsing the rest of the file.
    orders = pd.read_csv(
        source[n],
        usecols=['GETON_DATE', 'PASS_MILE'],
        dtype={'PASS_MILE': np.float32},
    )

    pickup_time = pd.to_datetime(orders['GETON_DATE'])
    # Rows without a pickup time cannot be assigned to an hour.
    has_time = pickup_time.notna()
    hours = pickup_time[has_time].dt.hour
    mileage = orders.loc[has_time, 'PASS_MILE']

    hourly_mean = mileage.groupby(hours).mean()

    # groupby only yields the hours present in the data; reindex so that
    # position h always means hour h, even when some hours have no orders.
    return hourly_mean.reindex(range(24), fill_value=0.0).values

# Animate the hourly mean PASS_MILE of the 2019 taxi order files (one frame
# per file, in sorted path order) and save the animation as order.gif.
fig = plt.figure()

paths = glob.glob('../input/taxiOrder20190*.csv')
paths.sort()
n = len(paths)  # number of frames: one per file
if n == 0:
    raise FileNotFoundError(
        "no files match '../input/taxiOrder20190*.csv'")

x = range(24)
# Create the bars with zero height; animate() sets the real heights, so no
# file has to be loaded only to build the artists.
barcollection = plt.bar(x, [0] * len(x))
plt.ylim(0, 8)
# Axis labels never change between frames, so set them once.
plt.ylabel('PASS_MILE / KM')
plt.xlabel('Hour')


def animate(i):
    """Update the bars and the title to show file ``paths[i]``.

    Args:
        i: Frame number supplied by FuncAnimation; it is used directly as the
            index into ``paths``.

    Raises:
        ValueError: If ``barlist`` does not return one value per bar.
    """
    y = barlist(i)
    if len(y) != len(barcollection):
        raise ValueError(
            'expected %d hourly values for %s, got %d'
            % (len(barcollection), paths[i], len(y)))
    for idx, b in enumerate(barcollection):
        b.set_height(y[idx])
    plt.title(paths[i].split('/')[-1])


# FuncAnimation uses the first frame for its initial draw when no init_func
# is given, so frames must start at file 0; mapping frame i to file i + 1
# would leave the first file out of the saved GIF.
anim = animation.FuncAnimation(fig, animate, repeat=False, blit=False,
                               frames=n, interval=500)

anim.save('order.gif', dpi=150)

  2. 巡游车空驶率对比(2019年与2020年);
# 2019年数据
paths = glob.glob('../input/taxiOrder20190*.csv')
paths.sort()
for path in paths:
    taxiorder2019 = pd.read_csv(path, nrows=None,
                               dtype = {
                                   'GETON_LONGITUDE': np.float32,
                                   'GETON_LATITUDE': np.float32,
                                   'GETOFF_LONGITUDE': np.float32,
                                   'GETOFF_LATITUDE': np.float32,
                                   'PASS_MILE': np.float16,
                                   'NOPASS_MILE': np.float16,
                                   'WAITING_TIME': np.float16
                               })


    taxiorder2019 = taxiorder2019[['NOPASS_MILE', 'PASS_MILE']].dropna()
    taxiorder2019['NOPASS_Ratio'] = taxiorder2019['NOPASS_MILE'] / (taxiorder2019['NOPASS_MILE'] + taxiorder2019['PASS_MILE'])

    print(path, taxiorder2019['NOPASS_Ratio'].astype(np.float32).mean())

# ../input/taxiOrder20190531.csv 0.27126783
# ../input/taxiOrder20190601.csv 0.27297953
# ../input/taxiOrder20190602.csv 0.30302802
# ../input/taxiOrder20190603.csv 0.31049386
# ../input/taxiOrder20190604.csv 0.3039471
# ../input/taxiOrder20190605.csv 0.2933384
# ../input/taxiOrder20190606.csv 0.2547359
# ../input/taxiOrder20190607.csv 0.28453994
# ../input/taxiOrder20190608.csv 0.304996
# ../input/taxiOrder20190609.csv 0.3115026
# 2020年数据
paths = glob.glob('../input/taxiOrder20200*.csv')
paths.sort()
for path in paths:
    taxiorder2019 = pd.read_csv(path, nrows=None,
                               dtype = {
                                   'GETON_LONGITUDE': np.float32,
                                   'GETON_LATITUDE': np.float32,
                                   'GETOFF_LONGITUDE': np.float32,
                                   'GETOFF_LATITUDE': np.float32,
                                   'PASS_MILE': np.float16,
                                   'NOPASS_MILE': np.float16,
                                   'WAITING_TIME': np.float16
                               })


    taxiorder2019 = taxiorder2019[['NOPASS_MILE', 'PASS_MILE']].dropna()
    taxiorder2019['NOPASS_Ratio'] = taxiorder2019['NOPASS_MILE'] / (taxiorder2019['NOPASS_MILE'] + taxiorder2019['PASS_MILE'])

    print(path, taxiorder2019['NOPASS_Ratio'].astype(np.float32).mean())

# ../input/taxiOrder20200618.csv 0.34004667
# ../input/taxiOrder20200619.csv 0.31731918
# ../input/taxiOrder20200620.csv 0.33150223
# ../input/taxiOrder20200621.csv 0.3449821
# ../input/taxiOrder20200622.csv 0.33434668
# ../input/taxiOrder20200623.csv 0.3306154
# ../input/taxiOrder20200624.csv 0.29195258
# ../input/taxiOrder20200625.csv 0.342389
# ../input/taxiOrder20200626.csv 0.3628601
# ../input/taxiOrder20200627.csv 0.35649845
  3. 网约车订单距离与时间的对比(2019年与2020年);

  4. 网约车空驶率对比(2019年与2020年);
def _is_plain_mileage(value):
    """Return True unless the text of ``value`` contains '-', '|' or '路'."""
    text = str(value)
    return not any(marker in text for marker in ('-', '|', '路'))


def wyc_empty_ratio(path, sep=',', drop_malformed=False):
    """Return the mean per-order empty-run ratio of one ride-hailing file.

    For each row the ratio is ``WAIT_MILE / (DRIVE_MILE + WAIT_MILE)``. The
    same formula is used for every file so that figures from different years
    can be compared directly. Rows whose ratio is undefined (NaN) or infinite
    (zero denominator) do not contribute to the mean.

    Args:
        path: Path of the ride-hailing order CSV file.
        sep: Field separator passed to ``pandas.read_csv``.
        drop_malformed: When True, discard rows whose ``DRIVE_MILE`` text
            contains '-', '|' or '路' and convert both mileage columns to
            float before computing the ratio.

    Returns:
        The mean of the per-row ratios.
    """
    # Only the two mileage columns are used, so skip parsing the rest.
    orders = pd.read_csv(path, sep=sep, usecols=['WAIT_MILE', 'DRIVE_MILE'])

    if drop_malformed:
        keep = orders['DRIVE_MILE'].apply(_is_plain_mileage).astype(bool)
        # astype returns a new frame, so nothing is assigned onto a slice.
        orders = orders[keep].astype(
            {'DRIVE_MILE': float, 'WAIT_MILE': float}).dropna()

    wait = orders['WAIT_MILE']
    ratio = wait / (orders['DRIVE_MILE'] + wait)
    # A zero denominator with non-zero WAIT_MILE gives +/-inf, which would
    # turn the whole mean into inf; treat it like an undefined ratio instead.
    ratio = ratio.replace([np.inf, -np.inf], np.nan)
    return ratio.mean()


# 2019 data
paths = sorted(glob.glob('../input/wycOrder2019*.csv'))
for path in paths:
    print(path, wyc_empty_ratio(path))

# Recorded output:
# ../input/wycOrder20190531.csv 0.04377351902383912
# ../input/wycOrder20190601.csv 0.05089443118832635
# ../input/wycOrder20190602.csv 0.05027405204548952
# ../input/wycOrder20190603.csv 0.04410937618481343
# ../input/wycOrder20190604.csv 0.04556210882875603
# ../input/wycOrder20190605.csv 0.044291802481437374
# ../input/wycOrder20190606.csv 0.050471234377955004
# ../input/wycOrder20190607.csv 0.06292328749500437
# ../input/wycOrder20190608.csv 0.05154802709775605
# ../input/wycOrder20190609.csv 0.05645411864784134

# 2020 data (these files are backslash-separated)
paths = sorted(glob.glob('../input/wycOrder2020*.csv'))
for path in paths:
    print(path, wyc_empty_ratio(path, sep='\\', drop_malformed=True))

# Recorded output. NOTE: these figures were produced when the 2020 loop used
# DRIVE_MILE + WAIT_MILE + 0.1 as the denominator; rerun to refresh them for
# the formula shared with the 2019 data.
# ../input/wycOrder20200618.csv 0.0366375999557114
# ../input/wycOrder20200619.csv 0.038303237011164316
# ../input/wycOrder20200621.csv 0.049643219579652106
# ../input/wycOrder20200622.csv 0.03599123977888786
# ../input/wycOrder20200623.csv 0.035535909940606306
# ../input/wycOrder20200624.csv 0.04063421181237617
# ../input/wycOrder20200625.csv 0.051779033543772356
# ../input/wycOrder20200626.csv 0.04035187796069988
# ../input/wycOrder20200627.csv 0.047062785762294765

## 学习资源


## 课堂任务

  1. 出租车和网约车的订单价格是由什么决定的?
  2. 为什么网约车空驶率比较低?

## 打卡任务

  1. 对比分析2019与2020年端午假期前一天(🤔分别对应哪一天?),巡游车日平均速度变化,上升还是下降?
  2. 对比分析2019年端午假期前、端午假期中和假期后,巡游车日平均速度变化,变化趋势是?


© 2019-2023 coggle.club 版权所有     京ICP备20022947    京公网安备 11030102010643号