Untitled

From Trivial Pintail, 1 Year ago, written in Plain Text, viewed 183 times.
URL http://codebin.org/view/82190504 Embed
Download Paste or View Raw
  1. import pandas as pd
  2.  
  3. data = pd.read_csv('/datasets/visits.csv', sep='\t')
  4.  
  5. # фильтруем слишком быстрые и медленные заезды и АЗС
  6. data['too_fast'] = data['time_spent'] < 60
  7. data['too_slow'] = data['time_spent'] > 1000
  8. too_fast_stat = data.pivot_table(index='id', values='too_fast')
  9. good_ids = too_fast_stat.query('too_fast < 0.5')
  10. good_data = data.query('id in @good_ids.index')
  11. good_data = good_data.query('60 <= time_spent <= 1000')
  12.  
  13. # считаем данные по отдельным АЗС и по сетям
  14. station_stat = data.pivot_table(index='id', values='time_spent', aggfunc='median')
  15. good_stations_stat = good_data.pivot_table(index='id', values='time_spent', aggfunc='median')
  16. stat = data.pivot_table(index='name', values='time_spent')
  17. good_stat = good_data.pivot_table(index='name', values='time_spent', aggfunc='median')
  18. stat['good_time_spent'] = good_stat['time_spent']
  19.  
  20. id_name = good_data.pivot_table(index='id', values='name', aggfunc=['first', 'count'])
  21. id_name.columns = ['name', 'count']
  22. station_stat_full = id_name.join(good_stations_stat)
  23.  
  24. # считаем показатели сетей из показателей АЗС,
  25. # а не усреднённые заезды на все АЗС сети
  26. good_stat2 = (
  27.     station_stat_full
  28.     .query('count > 30')
  29.     .pivot_table(index='name', values='time_spent', aggfunc=['median', 'count'])
  30. )
  31. good_stat2.columns = ['median_time', 'stations']
  32. final_stat = stat.join(good_stat2)
  33.  
  34. big_nets_stat = final_stat.query('stations > 10')
  35. station_stat_full['group_name'] = (
  36.     station_stat_full['name']
  37.     .where(station_stat_full['name'].isin(big_nets_stat.index), 'Другие')
  38. )
  39.  
  40. stat_grouped = (
  41.     station_stat_full
  42.     .query('count > 30')
  43.     .pivot_table(index='group_name', values='time_spent', aggfunc=['median', 'count'])
  44. )
  45. stat_grouped.columns = ['time_spent', 'count']
  46.  
  47. good_data['group_name'] = (
  48.     good_data['name']
  49.     .where(good_data['name'].isin(big_nets_stat.index), 'Другие')
  50. )
  51. for name, group_data in good_data.groupby('group_name'):
  52.     group_data.hist(['time_spent'], bins=50)

Reply to "Untitled"

Here you can reply to the paste above