# Tworzebue modelu do klastrowania danych

In [1]:
import pandas as pd
from pycaret.clustering import setup, create_model, plot_model, assign_model, save_model, load_model, predict_model
#biblioteki do znalezienia optymalnej liczby klastrów
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_csv('welcome_survey_simple_v1.csv', sep=';')
df.head()

Unnamed: 0,age,edu_level,fav_animals,fav_place,gender
0,<18,Podstawowe,Brak ulubionych,,Kobieta
1,25-34,Średnie,Psy,Nad wodą,Mężczyzna
2,45-54,Wyższe,Psy,W lesie,Mężczyzna
3,35-44,Średnie,Koty,W górach,Mężczyzna
4,35-44,Wyższe,Psy,Nad wodą,Mężczyzna


In [3]:
len(df)

140

In [4]:
s = setup(df, session_id=123)
s

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(140, 5)"
2,Transformed data shape,"(140, 21)"
3,Categorical features,5
4,Rows with missing values,11.4%
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


<pycaret.clustering.oop.ClusteringExperiment at 0x19a0b44aad0>

In [5]:
s.dataset.head()

Unnamed: 0,age,edu_level,fav_animals,fav_place,gender
0,<18,Podstawowe,Brak ulubionych,,Kobieta
1,25-34,Średnie,Psy,Nad wodą,Mężczyzna
2,45-54,Wyższe,Psy,W lesie,Mężczyzna
3,35-44,Średnie,Koty,W górach,Mężczyzna
4,35-44,Wyższe,Psy,Nad wodą,Mężczyzna


In [6]:
s.dataset_transformed.head()

Unnamed: 0,age_<18,age_25-34,age_45-54,age_35-44,age_18-24,age_>=65,age_55-64,age_unknown,edu_level_Podstawowe,edu_level_Średnie,...,fav_animals_Brak ulubionych,fav_animals_Psy,fav_animals_Koty,fav_animals_Inne,fav_animals_Koty i Psy,fav_place_Nad wodą,fav_place_W lesie,fav_place_W górach,fav_place_Inne,gender
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [7]:
kmeans = create_model('kmeans', num_clusters=8)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2031,16.8664,1.668,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
df_with_clusters = assign_model(kmeans)
df_with_clusters

Unnamed: 0,age,edu_level,fav_animals,fav_place,gender,Cluster
0,<18,Podstawowe,Brak ulubionych,,Kobieta,Cluster 7
1,25-34,Średnie,Psy,Nad wodą,Mężczyzna,Cluster 2
2,45-54,Wyższe,Psy,W lesie,Mężczyzna,Cluster 0
3,35-44,Średnie,Koty,W górach,Mężczyzna,Cluster 2
4,35-44,Wyższe,Psy,Nad wodą,Mężczyzna,Cluster 1
...,...,...,...,...,...,...
135,35-44,Wyższe,Koty,W górach,Mężczyzna,Cluster 3
136,35-44,Wyższe,Psy,W górach,Mężczyzna,Cluster 3
137,45-54,Wyższe,Psy,W lesie,Kobieta,Cluster 0
138,35-44,Wyższe,Psy,Nad wodą,Kobieta,Cluster 1


In [9]:
df_with_clusters["Cluster"].value_counts()

Cluster
Cluster 1    33
Cluster 2    20
Cluster 3    17
Cluster 5    17
Cluster 0    16
Cluster 4    15
Cluster 7    11
Cluster 6    11
Name: count, dtype: int64

In [10]:
plot_model(kmeans, plot='cluster')

In [11]:
save_model(kmeans, 'welcome_survey_clustering_pipeline_v1', verbose=False)

(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=[], transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['age', 'edu_level', 'fav_animals',
                                              'fav_place', 'gender'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=['gender'],
                                     transfo...
                                                                mapping=[{'col': 'gender',
                                                                          'data_type': dtype('O'),
                                                                          'mapping': Kobieta      0
 Mężczyzna    1
 NaN         -1
 dtype: int64}]))),
                 ('onehot_encoding',
                  TransformerWrappe

In [12]:
kmeans_pipeline = load_model('welcome_survey_clustering_pipeline_v1')
kmeans_pipeline

Transformation Pipeline and Model Successfully Loaded


In [13]:
predict_df = pd.DataFrame([
    {
        "age": "45-54",  # '<18', '25-34', '45-54', '35-44', '18-24', '>=65', '55-64', 'unknown'
        "edu_level": 'Średnie',  # 'Podstawowe', 'Średnie', 'Wyższe'
        "fav_animals": 'Brak ulubionych',  # 'Brak ulubionych', 'Psy', 'Koty', 'Inne', 'Koty i Psy'
        "fav_place": 'W lesie',  # 'Nad wodą', 'W lesie', 'W górach', 'Inne'
        "gender": 'Kobieta',  # 'Mężczyzna', 'Kobieta'
    }
])

In [14]:
predict_with_clusters_df = predict_model(kmeans_pipeline, data=predict_df)
predict_with_clusters_df["Cluster"]

0    Cluster 6
Name: Cluster, dtype: object