六、日期时间预处理

作者:Chris Albon

译者:飞龙

协议:CC BY-NC-SA 4.0

把日期和时间拆成多个特征

  1. # 加载库
  2. import pandas as pd
  3. # 创建数据帧
  4. df = pd.DataFrame()
  5. # 创建 150 个日期(每周一个)
  6. df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')
  7. # 为年、月、日、时、分创建特征
  8. df['year'] = df['date'].dt.year
  9. df['month'] = df['date'].dt.month
  10. df['day'] = df['date'].dt.day
  11. df['hour'] = df['date'].dt.hour
  12. df['minute'] = df['date'].dt.minute
  13. # 展示三行
  14. df.head(3)
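
| | date | year | month | day | hour | minute |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 2001-01-07 | 2001 | 1 | 7 | 0 | 0 |
| 1 | 2001-01-14 | 2001 | 1 | 14 | 0 | 0 |
| 2 | 2001-01-21 | 2001 | 1 | 21 | 0 | 0 |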

计算日期时间之间的差

  1. # 加载库
  2. import pandas as pd
  3. # 创建数据帧
  4. df = pd.DataFrame()
  5. # 创建两个 datetime 特征
  6. df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
  7. df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]
  8. # 计算特征之间的间隔
  9. df['Left'] - df['Arrived']
  10. '''
  11. 0 0 days
  12. 1 2 days
  13. dtype: timedelta64[ns]
  14. '''
  15. # 以整数天数的形式计算特征之间的间隔
  16. pd.Series(delta.days for delta in (df['Left'] - df['Arrived']))
  17. '''
  18. 0 0
  19. 1 2
  20. dtype: int64
  21. '''

将字符串转换为日期

  1. # 加载库
  2. import numpy as np
  3. import pandas as pd
  4. # 创建字符串
  5. date_strings = np.array(['03-04-2005 11:35 PM',
  6. '23-05-2010 12:01 AM',
  7. '04-09-2009 09:09 PM'])

如果 `errors="coerce"`,那么出现任何问题都不会抛出错误(抛出错误是默认行为),而是将导致错误的值设置为 `NaT`(即缺失值)。

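| 代码 | 描述 | 示例 |
| --- | --- | --- |
| `%Y` | 整年 | 2001 |
| `%m` | 零填充的月份 | 04 |
| `%d` | 零填充的日期 | 09 |
| `%I` | 零填充的小时(12 小时) | 02 |
| `%p` | AM 或 PM | AM |
| `%M` | 零填充的分钟 | 05 |
| `%S` | 零填充的秒钟 | 09 |
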
  1. # 转换为 datetime
  2. [pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="coerce") for date in date_strings]
  3. '''
  4. [Timestamp('2005-04-03 23:35:00'),
  5. Timestamp('2010-05-23 00:01:00'),
  6. Timestamp('2009-09-04 21:09:00')]
  7. '''

转换 pandas 列的时区

  1. # 加载库
  2. import pandas as pd
  3. from pytz import all_timezones
  4. # 展示十个时区
  5. all_timezones[0:10]
  6. '''
  7. ['Africa/Abidjan',
  8. 'Africa/Accra',
  9. 'Africa/Addis_Ababa',
  10. 'Africa/Algiers',
  11. 'Africa/Asmara',
  12. 'Africa/Asmera',
  13. 'Africa/Bamako',
  14. 'Africa/Bangui',
  15. 'Africa/Banjul',
  16. 'Africa/Bissau']
  17. '''
  18. # 创建十个日期
  19. dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq='M'))
  20. # 设置时区
  21. dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')
  22. # 查看 pandas 序列
  23. dates_with_abidjan_time_zone
  24. '''
  25. 0 2002-02-28 00:00:00+00:00
  26. 1 2002-03-31 00:00:00+00:00
  27. 2 2002-04-30 00:00:00+00:00
  28. 3 2002-05-31 00:00:00+00:00
  29. 4 2002-06-30 00:00:00+00:00
  30. 5 2002-07-31 00:00:00+00:00
  31. 6 2002-08-31 00:00:00+00:00
  32. 7 2002-09-30 00:00:00+00:00
  33. 8 2002-10-31 00:00:00+00:00
  34. 9 2002-11-30 00:00:00+00:00
  35. dtype: datetime64[ns, Africa/Abidjan]
  36. '''
  37. # 转换时区
  38. dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')
  39. # 查看 pandas 序列
  40. dates_with_london_time_zone
  41. '''
  42. 0 2002-02-28 00:00:00+00:00
  43. 1 2002-03-31 00:00:00+00:00
  44. 2 2002-04-30 01:00:00+01:00
  45. 3 2002-05-31 01:00:00+01:00
  46. 4 2002-06-30 01:00:00+01:00
  47. 5 2002-07-31 01:00:00+01:00
  48. 6 2002-08-31 01:00:00+01:00
  49. 7 2002-09-30 01:00:00+01:00
  50. 8 2002-10-31 00:00:00+00:00
  51. 9 2002-11-30 00:00:00+00:00
  52. dtype: datetime64[ns, Europe/London]
  53. '''

编码星期

  1. # 加载库
  2. import pandas as pd
  3. # 创建数据集
  4. dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))
  5. # 查看数据
  6. dates
  7. '''
  8. 0 2002-02-28
  9. 1 2002-03-31
  10. 2 2002-04-30
  11. dtype: datetime64[ns]
  12. '''
  13. # 查看星期(注:`weekday_name` 在 pandas 1.0 中已移除,新版本请使用 `dates.dt.day_name()`)
  14. dates.dt.weekday_name
  15. '''
  16. 0 Thursday
  17. 1 Sunday
  18. 2 Tuesday
  19. dtype: object
  20. '''

处理时间序列中的缺失值

  1. # 加载库
  2. import pandas as pd
  3. import numpy as np
  4. # 创建日期
  5. time_index = pd.date_range('01/01/2010', periods=5, freq='M')
  6. # 创建数据帧,设置索引
  7. df = pd.DataFrame(index=time_index)
  8. # 创建带有一些缺失值的特征
  9. df['Sales'] = [1.0,2.0,np.nan,np.nan,5.0]
  10. # 对缺失值执行插值
  11. df.interpolate()

| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 3.0 |
| 2010-04-30 | 4.0 |
| 2010-05-31 | 5.0 |

  1. # 前向填充
  2. df.ffill()

| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 2.0 |
| 2010-04-30 | 2.0 |
| 2010-05-31 | 5.0 |

  1. # 后向填充
  2. df.bfill()

| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 5.0 |
| 2010-04-30 | 5.0 |
| 2010-05-31 | 5.0 |

  1. # 对缺失值执行插值,但每段连续缺失值最多只向前填充一个
  2. df.interpolate(limit=1, limit_direction='forward')

| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 3.0 |
| 2010-04-30 | NaN |
| 2010-05-31 | 5.0 |

处理时区

  1. # 加载库
  2. import pandas as pd
  3. from pytz import all_timezones
  4. # 展示十个时区
  5. all_timezones[0:10]
  6. '''
  7. ['Africa/Abidjan',
  8. 'Africa/Accra',
  9. 'Africa/Addis_Ababa',
  10. 'Africa/Algiers',
  11. 'Africa/Asmara',
  12. 'Africa/Asmera',
  13. 'Africa/Bamako',
  14. 'Africa/Bangui',
  15. 'Africa/Banjul',
  16. 'Africa/Bissau']
  17. '''
  18. # 创建 datetime
  19. pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')
  20. # Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')
  21. # 创建 datetime
  22. date = pd.Timestamp('2017-05-01 06:00:00')
  23. # 设置时区
  24. date_in_london = date.tz_localize('Europe/London')
  25. # 修改时区
  26. date_in_london.tz_convert('Africa/Abidjan')
  27. # Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

平移时间特征

  1. # 加载库
  2. import pandas as pd
  3. # 创建数据帧
  4. df = pd.DataFrame()
  5. # 创建数据
  6. df['dates'] = pd.date_range('1/1/2001', periods=5, freq='D')
  7. df['stock_price'] = [1.1,2.2,3.3,4.4,5.5]
  8. # 将值平移一行
  9. df['previous_days_stock_price'] = df['stock_price'].shift(1)
  10. # 展示数据帧
  11. df
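
| | dates | stock_price | previous_days_stock_price |
| --- | --- | --- | --- |
| 0 | 2001-01-01 | 1.1 | NaN |
| 1 | 2001-01-02 | 2.2 | 1.1 |
| 2 | 2001-01-03 | 3.3 | 2.2 |
| 3 | 2001-01-04 | 4.4 | 3.3 |
| 4 | 2001-01-05 | 5.5 | 4.4 |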

滑动时间窗口

  1. # 加载库
  2. import pandas as pd
  3. # 创建 datetime
  4. time_index = pd.date_range('01/01/2010', periods=5, freq='M')
  5. # 创建数据帧,设置索引
  6. df = pd.DataFrame(index=time_index)
  7. # 创建特征
  8. df['Stock_Price'] = [1,2,3,4,5]
  9. # 计算滑动均值
  10. df.rolling(window=2).mean()

| | Stock_Price |
| --- | --- |
| 2010-01-31 | NaN |
| 2010-02-28 | 1.5 |
| 2010-03-31 | 2.5 |
| 2010-04-30 | 3.5 |
| 2010-05-31 | 4.5 |

  1. # 识别滑动时间窗口中的最大值
  2. df.rolling(window=2).max()

| | Stock_Price |
| --- | --- |
| 2010-01-31 | NaN |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 3.0 |
| 2010-04-30 | 4.0 |
| 2010-05-31 | 5.0 |

选择日期时间范围

  1. # 加载库
  2. import pandas as pd
  3. # 创建数据帧
  4. df = pd.DataFrame()
  5. # 创建 datetime
  6. df['date'] = pd.date_range('1/1/2001', periods=100000, freq='H')

如果数据帧未按时间索引,请使用此方法。

  1. # 选择两个日期时间之间的观测
  2. df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')]

| | date |
| --- | --- |
| 8762 | 2002-01-01 02:00:00 |
| 8763 | 2002-01-01 03:00:00 |
| 8764 | 2002-01-01 04:00:00 |

如果数据帧按时间索引,请使用此方法。

  1. # 设置索引
  2. df = df.set_index(df['date'])
  3. # 选择两个日期时间之间的观测
  4. df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

| date(索引) | date |
| --- | --- |
| 2002-01-01 01:00:00 | 2002-01-01 01:00:00 |
| 2002-01-01 02:00:00 | 2002-01-01 02:00:00 |
| 2002-01-01 03:00:00 | 2002-01-01 03:00:00 |
| 2002-01-01 04:00:00 | 2002-01-01 04:00:00 |