分析用户流失的特征，构建流失预警模型。
| user_id | reg_date | order_date | amount | last_login |
|---|---|---|---|---|
| 1001 | 2023-01-01 | 2023-01-05 | 1000 | 2023-01-10 |
| 1002 | 2023-01-02 | 2023-01-03 | 500 | 2023-01-03 |
| 1003 | 2023-01-03 | 2023-01-06 | 1500 | 2023-01-15 |
| 1004 | 2023-01-04 | 2023-01-04 | 300 | 2023-01-04 |
| 1005 | 2023-01-05 | 2023-01-07 | 800 | 2023-01-12 |
| 1006 | 2023-01-06 | 2023-01-06 | 1200 | 2023-01-06 |
| 1007 | 2023-01-07 | 2023-01-08 | 600 | 2023-01-14 |
| 1008 | 2023-01-08 | 2023-01-08 | 400 | 2023-01-08 |
| 1009 | 2023-01-09 | 2023-01-10 | 900 | 2023-01-16 |
| 1010 | 2023-01-10 | 2023-01-10 | 700 | 2023-01-10 |
import pandas as pd
import numpy as np
from io import StringIO

# Analysis parameters, named so they can be tuned in one place.
CHURN_THRESHOLD_DAYS = 7      # churned = more than this many days without a login
RISK_WEIGHT_INACTIVITY = 0.5  # weight of days since last login (raises the score)
RISK_WEIGHT_DELAY = 0.3       # weight of the registration-to-order delay (raises the score)
RISK_WEIGHT_AMOUNT = 0.001    # weight of the order amount (lowers the score)
DATE_COLUMNS = ['reg_date', 'order_date', 'last_login']
FEATURE_COLUMNS = ['amount', 'active_days', 'purchase_delay', 'days_since_last_login']

# Inline sample data: one row per user with registration date, order date,
# order amount and last-login date.
data = '''user_id,reg_date,order_date,amount,last_login
1001,2023-01-01,2023-01-05,1000,2023-01-10
1002,2023-01-02,2023-01-03,500,2023-01-03
1003,2023-01-03,2023-01-06,1500,2023-01-15
1004,2023-01-04,2023-01-04,300,2023-01-04
1005,2023-01-05,2023-01-07,800,2023-01-12
1006,2023-01-06,2023-01-06,1200,2023-01-06
1007,2023-01-07,2023-01-08,600,2023-01-14
1008,2023-01-08,2023-01-08,400,2023-01-08
1009,2023-01-09,2023-01-10,900,2023-01-16
1010,2023-01-10,2023-01-10,700,2023-01-10
'''

# Load the CSV text into a DataFrame.
df = pd.read_csv(StringIO(data))

# Parse the date columns so that date arithmetic works on them.
for date_column in DATE_COLUMNS:
    df[date_column] = pd.to_datetime(df[date_column])

print("原始数据:")
print(df)
print("\n数据基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n数据描述性统计:")
print(df.describe())

# Days between registration and the last login.
# NOTE(review): despite the name, this is a time span, not a count of days on
# which the user was actually active.
df['active_days'] = (df['last_login'] - df['reg_date']).dt.days
print("\n添加活跃天数后的数据:")
print(df)

# Days between registration and the order.
df['purchase_delay'] = (df['order_date'] - df['reg_date']).dt.days
print("\n添加购买延迟后的数据:")
print(df)

# Days from the last login to "today"; the current date is assumed to be
# 2023-01-20 for this sample.
current_date = pd.to_datetime('2023-01-20')
df['days_since_last_login'] = (current_date - df['last_login']).dt.days
print("\n添加最近登录天数后的数据:")
print(df)

# Label a user as churned when they have not logged in for more than
# CHURN_THRESHOLD_DAYS days.
df['is_churn'] = df['days_since_last_login'] > CHURN_THRESHOLD_DAYS
print("\n添加流失标签后的数据:")
print(df)

# Compare churned and retained users: per-group feature means and group size.
print("\n流失用户与非流失用户对比:")
churn_analysis = df.groupby('is_churn').agg({
    'amount': 'mean',
    'active_days': 'mean',
    'purchase_delay': 'mean',
    'days_since_last_login': 'mean',
    'user_id': 'count',
}).rename(columns={'user_id': 'count'})
print(churn_analysis)

# Correlation of each feature with the churn label.
print("\n特征与流失的相关性:")
correlation = df[FEATURE_COLUMNS + ['is_churn']].corr()
print(correlation['is_churn'])

# Hand-weighted risk score: inactivity and purchase delay raise it, a larger
# order amount lowers it.
# NOTE(review): days_since_last_login also defines is_churn, so the score is
# not independent of the label; the weights are fixed constants, not fitted.
df['churn_risk_score'] = (
    df['days_since_last_login'] * RISK_WEIGHT_INACTIVITY
    + df['purchase_delay'] * RISK_WEIGHT_DELAY
    - df['amount'] * RISK_WEIGHT_AMOUNT
)
print("\n添加流失风险评分后的数据:")
print(df[['user_id'] + FEATURE_COLUMNS + ['is_churn', 'churn_risk_score']])

# Rank users from highest to lowest risk score.
print("\n按流失风险评分排序:")
print(df.sort_values('churn_risk_score', ascending=False)[['user_id', 'churn_risk_score', 'is_churn']])

# Purchase behaviour of churned vs. retained users. is_churn is already a
# boolean column, so it is used directly as the row mask.
print("\n流失用户的购买行为分析:")
churned_users = df[df['is_churn']]
non_churned_users = df[~df['is_churn']]
print(f"流失用户平均购买金额: {churned_users['amount'].mean():.2f}")
print(f"非流失用户平均购买金额: {non_churned_users['amount'].mean():.2f}")
print(f"流失用户平均购买延迟: {churned_users['purchase_delay'].mean():.2f} 天")
print(f"非流失用户平均购买延迟: {non_churned_users['purchase_delay'].mean():.2f} 天")