# 1.1 Balancing the target class
```python
# Approach 1: balancing via class weights
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

dict_classes = {0: 1, 1: class_ratio}
classificator = LogisticRegression(class_weight=dict_classes)
f1_balanced = cross_val_score(classificator,
                              features_train,
                              target_train,
                              cv=cv_counts,
                              scoring='f1').mean()
print('F1 on CV with manual class weights', f1_balanced)
```
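The dictionary above relies on `class_ratio`, which is defined earlier in the notebook. As a hedged sketch (not the notebook's actual code), it could be the majority-to-minority ratio of the training target, so that errors on the rare class 1 are penalized proportionally more:

```python
# Sketch only: a plausible definition of class_ratio, assuming target_train holds 0/1 labels
class_counts = target_train.value_counts()
class_ratio = class_counts[0] / class_counts[1]  # weight assigned to the rare class 1
```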
```python
# Same model, letting scikit-learn pick the class weights automatically
classificator = LogisticRegression(class_weight='balanced')
f1_balanced = cross_val_score(classificator,
                              features_train,
                              target_train,
                              cv=cv_counts,
                              scoring='f1').mean()
print("F1 on CV with class_weight='balanced'", f1_balanced)
```
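With `class_weight='balanced'`, scikit-learn derives the weights itself as `n_samples / (n_classes * np.bincount(y))`, so no manual dictionary is needed. A small sketch (assuming `target_train` holds 0/1 labels) to inspect the weights it would use:

```python
# Inspect the weights that class_weight='balanced' corresponds to
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

balanced_weights = compute_class_weight(class_weight='balanced',
                                        classes=np.array([0, 1]),
                                        y=target_train)
print('Weights for classes 0 and 1:', balanced_weights)
```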
```python
# Approach 2: resampling by downsampling class 0
import pandas as pd
from sklearn.utils import shuffle

# Select the training rows by index label (.loc, since the split keeps the original index)
df_train = df.loc[target_train.index]
target_train_class_zero = df_train[df_train['toxic'] == 0]['toxic']
target_train_class_one = df_train[df_train['toxic'] == 1]['toxic']
# Keep only as many class-0 objects as there are class-1 objects
target_train_class_zero_downsample = target_train_class_zero.sample(target_train_class_one.shape[0],
                                                                    random_state=12082020)
target_train_downsample = pd.concat([target_train_class_zero_downsample, target_train_class_one])
features_train_downsample = df.loc[target_train_downsample.index]
features_train_downsample, target_train_downsample = shuffle(features_train_downsample,
                                                              target_train_downsample,
                                                              random_state=12082020)
# count_tf_idf is the TF-IDF vectorizer fitted earlier, so only transform is applied here
features_train_downsample = count_tf_idf.transform(features_train_downsample['lemm_text'].values)
classificator = LogisticRegression()
train_f1_downsampled = cross_val_score(classificator,
                                       features_train_downsample,
                                       target_train_downsample,
                                       cv=cv_counts,
                                       scoring='f1').mean()
print('F1 on CV with downsampled class 0', train_f1_downsampled)
```
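After downsampling it is worth confirming that the training target is actually balanced and how much data was discarded; a minimal check using the variables above:

```python
# Sanity check: both classes should now have the same number of objects
print(target_train_downsample.value_counts())
print('Training set size after downsampling:', target_train_downsample.shape[0])
```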