# Pivot table: number of unique users per (event, experiment group) pair.
group = logs.groupby(['event_name', 'group_id']).agg({'user_id': 'nunique'})
group = group.reset_index()
group = group.rename(columns={'user_id': 'total_users'})
# Descending by group id first, then by user count within each group.
group = group.sort_values(by=['group_id', 'total_users'], ascending=False)
# Drop the rows for the 'Tutorial' event.
group = group.loc[group['event_name'] != 'Tutorial']
group
# Helper: z-test for the equality of two proportions.
def hyp(df1, df2, event, alpha, n):
    """Run a two-sided z-test on the share of users who triggered `event`.

    For each of the two frames the share is computed as
    (unique `user_id` values with `event_name == event`) divided by
    (unique `user_id` values in the whole frame). The event name, the
    p-value and the verdict are printed; nothing is returned.

    Args:
        df1: first DataFrame; must have `user_id` and `event_name` columns.
        df2: second DataFrame with the same columns.
        event: value of `event_name` whose conversion share is compared.
        alpha: significance level before correction.
        n: number of comparisons; `alpha` is divided by it
            (Bonferroni correction) before the p-value is compared.
    """
    bonf_alpha = alpha / n

    n_users = np.array([df1['user_id'].nunique(),
                        df2['user_id'].nunique()])

    # Filter by the `event` argument. The previous code read an outer
    # variable named `event_name`, so the function only worked when a
    # global with that name happened to exist and could test a different
    # event than the one it reported.
    success = np.array([df1[df1['event_name'] == event]['user_id'].nunique(),
                        df2[df2['event_name'] == event]['user_id'].nunique()])

    p1 = success[0] / n_users[0]
    p2 = success[1] / n_users[1]
    # Pooled proportion over both samples.
    p_combined = (success[0] + success[1]) / (n_users[0] + n_users[1])

    difference = p1 - p2
    z_value = difference / np.sqrt(
        p_combined * (1 - p_combined) * (1 / n_users[0] + 1 / n_users[1]))

    # Standard normal distribution; doubling the tail makes the test two-sided.
    distr = st.norm(0, 1)
    p_value = (1 - distr.cdf(abs(z_value))) * 2

    print('Событие:', event)
    print('p-значение: ', p_value)

    if p_value < bonf_alpha:
        print('Отвергаем нулевую гипотезу: между долями есть разница')
    else:
        print(
            'Не получилось отвергнуть нулевую гипотезу, нет оснований считать доли разными')
# Then pass each pair of groups to the function, once per event.
# Each entry: (row mask for the first sample, row mask for the second
# sample, number of comparisons used for the Bonferroni correction).
comparisons = (
    (logs['group_id'] == 246, logs['group_id'] == 247, 4),
    (logs['group_id'] == 246, logs['group_id'] == 248, 12),
    (logs['group_id'] == 247, logs['group_id'] == 248, 12),
    # Every group except 248, pooled, against 248.
    (logs['group_id'] != 248, logs['group_id'] == 248, 12),
)

# NOTE(review): the inner loop variable is deliberately kept as
# `event_name` -- confirm hyp() does not read it as a global before renaming.
for first_mask, second_mask, n_tests in comparisons:
    for event_name in group['event_name'].unique():
        hyp(logs[first_mask], logs[second_mask], event_name, .05, n_tests)
        print()