Тема 3 Урок 7 Задача 3

From Андрей, 2 Months ago, written in Python, viewed 305 times.
URL http://codebin.org/view/2fc93cce Embed
Download Paste or View Raw
  1. import pandas as pd
  2. import numpy as np
  3. from datetime import datetime, timedelta
  4. from matplotlib import pyplot as plt
  5.  
  6.  
  7. # функция для получения пользовательских профилей
  8. def get_profiles(sessions, orders, events, ad_costs, event_names=[]):
  9.     profiles = (
  10.         sessions.sort_values(by=['user_id', 'session_start'])
  11.         .groupby('user_id')
  12.         .agg(
  13.             {
  14.                 'session_start': 'first',
  15.                 'channel': 'first',
  16.                 'device': 'first',
  17.                 'region': 'first',
  18.             }
  19.         )
  20.         .rename(columns={'session_start': 'first_ts'})
  21.         .reset_index()
  22.     )
  23.     profiles['dt'] = profiles['first_ts'].dt.date
  24.     profiles['month'] = profiles['first_ts'].astype('datetime64[M]')
  25.     profiles['payer'] = profiles['user_id'].isin(orders['user_id'].unique())
  26.  
  27.     for event in event_names:
  28.         if event in events['event_name'].unique():
  29.             profiles[event] = profiles['user_id'].isin(
  30.                 events.query('event_name == @event')['user_id'].unique()
  31.             )
  32.     new_users = (
  33.         profiles.groupby(['dt', 'channel'])
  34.         .agg({'user_id': 'nunique'})
  35.         .rename(columns={'user_id': 'unique_users'})
  36.         .reset_index()
  37.     )
  38.     ad_costs = ad_costs.merge(new_users, on=['dt', 'channel'], how='left')
  39.     ad_costs['acquisition_cost'] = ad_costs['costs'] / ad_costs['unique_users']
  40.     profiles = profiles.merge(
  41.         ad_costs[['dt', 'channel', 'acquisition_cost']],
  42.         on=['dt', 'channel'],
  43.         how='left',
  44.     )
  45.     profiles['acquisition_cost'] = profiles['acquisition_cost'].fillna(0)
  46.  
  47.     return profiles
  48.  
  49.  
  50. # функция для расчёта LTV и ROI
  51. def get_ltv(
  52.     profiles,
  53.     purchases,
  54.     observation_date,
  55.     horizon_days,
  56.     dimensions=[],
  57.     ignore_horizon=False,
  58. ):
  59.  
  60.     last_suitable_acquisition_date = observation_date
  61.     if not ignore_horizon:
  62.         last_suitable_acquisition_date = observation_date - timedelta(
  63.             days=horizon_days - 1
  64.         )
  65.     result_raw = profiles.query('dt <= @last_suitable_acquisition_date')
  66.     result_raw = result_raw.merge(
  67.         purchases[['user_id', 'event_dt', 'revenue']], on='user_id', how='left'
  68.     )
  69.     result_raw['lifetime'] = (
  70.         result_raw['event_dt'] - result_raw['first_ts']
  71.     ).dt.days
  72.  
  73.     if len(dimensions) == 0:
  74.         result_raw['cohort'] = 'All users'
  75.         dimensions = dimensions + ['cohort']
  76.  
  77.     def group_by_dimensions(df, dims, horizon_days):
  78.         result = df.pivot_table(
  79.             index=dims, columns='lifetime', values='revenue', aggfunc='sum'
  80.         )
  81.         result = result.fillna(0).cumsum(axis=1)
  82.         cohort_sizes = (
  83.             df.groupby(dims)
  84.             .agg({'user_id': 'nunique'})
  85.             .rename(columns={'user_id': 'cohort_size'})
  86.         )
  87.         result = cohort_sizes.merge(result, on=dims, how='left').fillna(0)
  88.         result = result.div(result['cohort_size'], axis=0)
  89.         result = result[['cohort_size'] + list(range(horizon_days))]
  90.         result['cohort_size'] = cohort_sizes
  91.         cac = df[['user_id', 'acquisition_cost'] + dims].drop_duplicates()
  92.         cac = (
  93.             cac.groupby(dims)
  94.             .agg({'acquisition_cost': 'mean'})
  95.             .rename(columns={'acquisition_cost': 'cac'})
  96.         )
  97.         roi = result.div(cac['cac'], axis=0)
  98.         roi = roi[~roi['cohort_size'].isin([np.inf])]
  99.         roi['cohort_size'] = cohort_sizes
  100.         roi['cac'] = cac['cac']
  101.         roi = roi[['cohort_size', 'cac'] + list(range(horizon_days))]
  102.         return result, roi
  103.  
  104.     result_grouped, roi_grouped = group_by_dimensions(
  105.         result_raw, dimensions, horizon_days
  106.     )
  107.     if 'cohort' in dimensions:
  108.         dimensions = []
  109.     result_in_time, roi_in_time = group_by_dimensions(
  110.         result_raw, dimensions + ['dt'], horizon_days
  111.     )
  112.  
  113.     return (
  114.         result_raw,  # сырые данные
  115.         result_grouped,  # таблица LTV
  116.         result_in_time,  # таблица динамики LTV
  117.         roi_grouped,  # таблица ROI
  118.         roi_in_time,  # таблица динамики ROI
  119.     )
  120.  
  121.  
  122. visits = pd.read_csv('ch02_problems03_visits.csv')  # журнал сессий
  123. purchases = pd.read_csv('ch02_problems02_orders.csv')  # покупки
  124. ad_costs = pd.read_csv('ch02_problems02_costs.csv')  # траты на рекламу
  125. events = None  # других событий нет
  126.  
  127. # преобразование данных о времени
  128. visits['session_start'] = pd.to_datetime(visits['session_start'])
  129. purchases['event_dt'] = pd.to_datetime(purchases['event_dt'])
  130.  
  131. # разбиваем траты на рекламу по дням
  132. min_date = datetime(2020, 7, 11)
  133. max_date = datetime(2020, 8, 10)
  134. days_num = (max_date - min_date).days
  135. daily_costs = []
  136. for index, values in ad_costs.iterrows():
  137.     channel_name = values['channel']
  138.     channel_costs = values['costs']
  139.     avg_channel_costs = channel_costs / days_num
  140.     for i in range(days_num):
  141.         current_day = min_date + timedelta(days=i)
  142.         daily_costs += [[current_day, channel_name, avg_channel_costs]]
  143. daily_costs = pd.DataFrame(daily_costs, columns=['dt', 'channel', 'costs'])
  144. daily_costs['dt'] = daily_costs['dt'].dt.date
  145.  
  146. users = get_profiles(visits, purchases, events, daily_costs)  # профили с CAC
  147.  
  148. # получите сырые данные, таблицы LTV и ROI, а также таблицы динамики LTV и ROI
  149. ltv_raw, ltv, ltv_history, roi, roi_history = get_ltv(
  150.    users, purchases, datetime(2020, 8, 10).date(), 7, dimensions=['channel']
  151. )
  152.  
  153. # исключите размеры когорт и CAC из таблицы ROI
  154. report = roi.drop(columns=['cohort_size'])
  155. report = roi.drop(columns=['cac'])
  156. # строим кривые ROI
  157. report.T.plot(grid=True, figsize=(10, 5), xticks=list(report.columns.values))
  158. plt.title('ROI с разбивкой по каналам привлечения')
  159. plt.ylabel('ROI')
  160. plt.xlabel('Лайфтайм')
  161. plt.axhline(y=1, color='red', linestyle='--', label='Уровень окупаемости')
  162. plt.legend()
  163. plt.show()

Replies to Тема 3 Урок 7 Задача 3 rss

Title Name Language When
Re: Тема 3 Урок 7 Задача 3 Coral Peafowl python 17 Hours ago.

Reply to "Тема 3 Урок 7 Задача 3 "

Here you can reply to the paste above