使用K-Means算法对用户进行聚类分析,识别不同用户群体。
| user_id | total_amount | freq | avg_session_time |
|---|---|---|---|
| 1001 | 1000 | 10 | 30 |
| 1002 | 500 | 5 | 20 |
| 1003 | 2000 | 15 | 40 |
| 1004 | 300 | 3 | 15 |
| 1005 | 800 | 8 | 25 |
| 1006 | 1500 | 12 | 35 |
| 1007 | 200 | 2 | 10 |
| 1008 | 1200 | 9 | 28 |
| 1009 | 600 | 6 | 22 |
| 1010 | 1800 | 14 | 38 |
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
# Sample user data, embedded as CSV text so the script is self-contained.
data = '''user_id,total_amount,freq,avg_session_time
1001,1000,10,30
1002,500,5,20
1003,2000,15,40
1004,300,3,15
1005,800,8,25
1006,1500,12,35
1007,200,2,10
1008,1200,9,28
1009,600,6,22
1010,1800,14,38
'''
# Parse the CSV text into a DataFrame.
from io import StringIO
df = pd.read_csv(StringIO(data))
print("原始数据:")
print(df)
print("\n数据基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()
print("\n数据描述性统计:")
print(df.describe())
# Select the three behavioural features used for clustering.
X = df.loc[:, ['total_amount', 'freq', 'avg_session_time']]
# Standardize the features: fit the scaler on X, then transform the same X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# Choose the number of clusters by comparing silhouette scores.
print("\n确定最佳聚类数:")
inertia = []
silhouette_scores = []
# silhouette_score requires 2 <= k <= n_samples - 1, so cap the search at
# the sample count instead of assuming there are always more than 5 rows.
candidate_ks = range(2, min(5, len(X_scaled) - 1) + 1)
if not candidate_ks:
    raise ValueError("样本数量不足,至少需要3个样本才能计算轮廓系数")
# NOTE(review): n_init is left at the library default, which changed between
# scikit-learn releases; pin it if cross-version reproducibility matters.
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append(score)
    print(f"k={k}, 轮廓系数={score:.4f}")
# Pick the k with the highest silhouette score. Index into the candidate
# range rather than adding a magic offset, and store a plain Python int.
best_k = int(candidate_ks[int(np.argmax(silhouette_scores))])
print(f"\n最佳聚类数: {best_k}")
# Fit the final model with the selected number of clusters.
kmeans = KMeans(n_clusters=best_k, random_state=42).fit(X_scaled)
# labels_ holds the cluster index assigned to each row during fitting.
df['cluster'] = kmeans.labels_
print("\n聚类结果:")
print(df)
# Summarize each cluster: mean of every feature plus the number of users.
print("\n各聚类的特征:")
cluster_analysis = df.groupby('cluster').agg(
    total_amount=('total_amount', 'mean'),
    freq=('freq', 'mean'),
    avg_session_time=('avg_session_time', 'mean'),
    count=('user_id', 'count'),
)
print(cluster_analysis)
# Give each cluster a human-readable name.
# K-Means label ids are arbitrary, so rank the clusters by mean total_amount
# (rank 0 = highest spend) instead of assuming label 0 is the top tier.
_clusters_by_spend = (
    df.groupby('cluster')['total_amount']
    .mean()
    .sort_values(ascending=False, kind='mergesort')  # stable order on ties
    .index
)
_spend_rank = {label: rank for rank, label in enumerate(_clusters_by_spend)}


def cluster_name(cluster):
    """Return the display name for a cluster label.

    The cluster with the highest mean total_amount is named
    '高价值活跃用户', the second highest '中等价值用户', and every other
    label (including one that does not occur in df) '低价值用户'.
    """
    rank = _spend_rank.get(cluster)
    if rank == 0:
        return '高价值活跃用户'
    if rank == 1:
        return '中等价值用户'
    return '低价值用户'


df['cluster_name'] = df['cluster'].apply(cluster_name)
# Show every user alongside the assigned cluster id and cluster name.
print("\n带聚类名称的结果:")
display_columns = [
    'user_id',
    'total_amount',
    'freq',
    'avg_session_time',
    'cluster',
    'cluster_name',
]
print(df[display_columns])
# Share of users in each named cluster, expressed as a percentage.
print("\n各聚类占比:")
cluster_counts = df['cluster_name'].value_counts(normalize=True).mul(100)
print(cluster_counts)
# Print a detailed per-cluster report.
print("\n各聚类详细分析:")
# sort=False keeps the clusters in order of first appearance in df.
for cluster, cluster_data in df.groupby('cluster', sort=False):
    name = cluster_data['cluster_name'].iloc[0]
    user_count = len(cluster_data)
    mean_amount = cluster_data['total_amount'].mean()
    mean_freq = cluster_data['freq'].mean()
    mean_session = cluster_data['avg_session_time'].mean()
    print(f"\n聚类 {cluster} ({name}):")
    print(f"用户数量: {user_count}")
    print(f"平均消费金额: {mean_amount:.2f}")
    print(f"平均消费频次: {mean_freq:.2f}")
    print(f"平均会话时间: {mean_session:.2f}")