项目6:用户聚类分析(K-Means)

使用K-Means算法对用户进行聚类分析,识别不同用户群体。

数据预览

user_id  total_amount  freq  avg_session_time
1001     1000          10    30
1002     500           5     20
1003     2000          15    40
1004     300           3     15
1005     800           8     25
1006     1500          12    35
1007     200           2     10
1008     1200          9     28
1009     600           6     22
1010     1800          14    38

代码编辑器

参考答案

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Sample data: one row per user with total spend, purchase frequency and
# average session time, embedded as CSV text so the script is self-contained.
data = '''user_id,total_amount,freq,avg_session_time
1001,1000,10,30
1002,500,5,20
1003,2000,15,40
1004,300,3,15
1005,800,8,25
1006,1500,12,35
1007,200,2,10
1008,1200,9,28
1009,600,6,22
1010,1800,14,38
'''

# Parse the CSV text into a DataFrame (StringIO makes the string file-like).
from io import StringIO
df = pd.read_csv(StringIO(data))

print("原始数据:")
print(df)
print("\n数据基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n数据描述性统计:")
print(df.describe())

# Feature matrix used for clustering (user_id is an identifier, not a feature).
X = df[['total_amount', 'freq', 'avg_session_time']]

# Standardize the features so that total_amount, which has a much larger
# numeric range than the other columns, does not dominate the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Evaluate candidate cluster counts by silhouette score.
print("\n确定最佳聚类数:")
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# candidate range is capped by the number of samples.
k_values = list(range(2, min(6, len(X_scaled))))
inertia = []
silhouette_scores = []
for k in k_values:
    # n_init is pinned explicitly: the library default differs between
    # scikit-learn releases, which would make results version-dependent.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append(score)
    print(f"k={k}, 轮廓系数={score:.4f}")

# Pick the k with the highest silhouette score. Indexing k_values (instead of
# adding a hard-coded offset) keeps this correct if the candidate range changes.
best_k = k_values[int(np.argmax(silhouette_scores))]
print(f"\n最佳聚类数: {best_k}")

# Final clustering with the selected k; labels are stored on the DataFrame.
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

print("\n聚类结果:")
print(df)

# Summarize each cluster: mean of every feature plus the number of users.
print("\n各聚类的特征:")
cluster_analysis = df.groupby('cluster').agg(
    total_amount=('total_amount', 'mean'),
    freq=('freq', 'mean'),
    avg_session_time=('avg_session_time', 'mean'),
    count=('user_id', 'count'),
)
print(cluster_analysis)

# Give each cluster a human-readable name.
def cluster_name(cluster, cluster_means=None):
    """Return the value-tier name for a cluster label.

    K-Means label ids are arbitrary (label 0 is not necessarily the
    highest-spending group), so the tier is derived from the cluster's rank
    by mean ``total_amount`` rather than from the label id itself:
    the top-ranked cluster is the high-value tier, the second-ranked cluster
    is the medium-value tier, and every other cluster (or an unknown label)
    is the low-value tier.

    Args:
        cluster: Cluster label to name.
        cluster_means: Optional Series indexed by cluster label holding each
            cluster's mean ``total_amount``. When omitted it is computed from
            the module-level ``df`` on every call; pass it explicitly when
            naming many rows to avoid recomputing it.

    Returns:
        The tier name as a string.
    """
    if cluster_means is None:
        cluster_means = df.groupby('cluster')['total_amount'].mean()

    # Cluster labels ordered from highest to lowest mean spend.
    ranked_labels = (
        cluster_means.sort_values(ascending=False, kind='stable').index.tolist()
    )
    tier_names = ('高价值活跃用户', '中等价值用户')

    if cluster in ranked_labels:
        rank = ranked_labels.index(cluster)
        if rank < len(tier_names):
            return tier_names[rank]
    return '低价值用户'

# Attach the readable name of each user's cluster.
df['cluster_name'] = df['cluster'].map(cluster_name)

report_columns = [
    'user_id', 'total_amount', 'freq', 'avg_session_time',
    'cluster', 'cluster_name',
]
print("\n带聚类名称的结果:")
print(df[report_columns])

# Share of users per named cluster, as a percentage.
print("\n各聚类占比:")
cluster_counts = df['cluster_name'].value_counts(normalize=True).mul(100)
print(cluster_counts)

# Per-cluster detail report, in order of first appearance of each label.
print("\n各聚类详细分析:")
metric_columns = ['total_amount', 'freq', 'avg_session_time']
for label in df['cluster'].unique():
    members = df.loc[df['cluster'] == label]
    averages = members[metric_columns].mean()
    print(f"\n聚类 {label} ({members['cluster_name'].iloc[0]}):")
    print(f"用户数量: {len(members)}")
    print(f"平均消费金额: {averages['total_amount']:.2f}")
    print(f"平均消费频次: {averages['freq']:.2f}")
    print(f"平均会话时间: {averages['avg_session_time']:.2f}")
返回主页