import pandas as pd

# Load the raw visit log (tab-separated file).
data = pd.read_csv('/datasets/visits.csv', sep='\t')

# Filter out visits that are too fast or too slow, and unreliable stations.
data['too_fast'] = data['time_spent'] < 60
data['too_slow'] = data['time_spent'] > 1000

# pivot_table aggregates with the mean by default; the mean of a boolean
# column is the share of True values, i.e. the share of too-fast visits per id.
too_fast_stat = data.pivot_table(index='id', values='too_fast')

# Keep only ids where fewer than half of the visits were too fast ...
good_ids = too_fast_stat.query('too_fast < 0.5')
good_data = data.query('id in @good_ids.index')
# ... and, among those, only visits whose duration lies within [60, 1000].
good_data = good_data.query('60 <= time_spent <= 1000')

# Compute statistics per individual station (id) and per chain (name).
station_stat = data.pivot_table(
    index='id', values='time_spent', aggfunc='median'
)
good_stations_stat = good_data.pivot_table(
    index='id', values='time_spent', aggfunc='median'
)
# Unfiltered per-name figure uses the default aggregation (mean).
stat = data.pivot_table(index='name', values='time_spent')
good_stat = good_data.pivot_table(
    index='name', values='time_spent', aggfunc='median'
)
stat['good_time_spent'] = good_stat['time_spent']

# For every id: the name it belongs to and how many filtered visits it has.
id_name = good_data.pivot_table(
    index='id', values='name', aggfunc=['first', 'count']
)
id_name.columns = ['name', 'count']
station_stat_full = id_name.join(good_stations_stat)

# Compute chain-level figures from the per-station medians rather than from
# visits pooled across all stations of the chain. Only ids with more than
# 30 filtered visits take part.
good_stat2 = (
    station_stat_full
    .query('count > 30')
    .pivot_table(index='name', values='time_spent', aggfunc=['median', 'count'])
)
good_stat2.columns = ['median_time', 'stations']
final_stat = stat.join(good_stat2)

# DataFrame.sort_values requires `by` and returns a new frame, so the bare
# call raised TypeError and would have discarded the result anyway.
# NOTE(review): sorting by 'median_time' because it is the plotted value --
# confirm this is the intended ordering.
final_stat = final_stat.sort_values(by='median_time')
final_stat.plot(x='good_time_spent', y='median_time', kind='bar', figsize=(10, 5))
Re: Re: Untitled
From Bistre Camel, 4 Months ago, written in Plain Text, viewed 66 times.
This paste is a reply to Re: Untitled from Emerald Crane
- view diff
URL http://codebin.org/view/50b75d73
Embed
Download Paste or View Raw
— Expand Paste to full width of browser