项目9:用户流失预警特征分析

分析用户流失的特征,构建流失预警模型。

数据预览

user_id reg_date order_date amount last_login
1001 2023-01-01 2023-01-05 1000 2023-01-10
1002 2023-01-02 2023-01-03 500 2023-01-03
1003 2023-01-03 2023-01-06 1500 2023-01-15
1004 2023-01-04 2023-01-04 300 2023-01-04
1005 2023-01-05 2023-01-07 800 2023-01-12
1006 2023-01-06 2023-01-06 1200 2023-01-06
1007 2023-01-07 2023-01-08 600 2023-01-14
1008 2023-01-08 2023-01-08 400 2023-01-08
1009 2023-01-09 2023-01-10 900 2023-01-16
1010 2023-01-10 2023-01-10 700 2023-01-10

代码编辑器

参考答案

# Churn early-warning feature analysis on a small inline sample.
#
# The script derives day-count features from each user's registration, order
# and last-login dates, labels users as churned based on login inactivity,
# compares the churned and retained groups, and ranks users by a hand-weighted
# risk score. Every intermediate result is printed to stdout.
from io import StringIO

import numpy as np
import pandas as pd

# Inline sample data: one row per user.
data = '''user_id,reg_date,order_date,amount,last_login
1001,2023-01-01,2023-01-05,1000,2023-01-10
1002,2023-01-02,2023-01-03,500,2023-01-03
1003,2023-01-03,2023-01-06,1500,2023-01-15
1004,2023-01-04,2023-01-04,300,2023-01-04
1005,2023-01-05,2023-01-07,800,2023-01-12
1006,2023-01-06,2023-01-06,1200,2023-01-06
1007,2023-01-07,2023-01-08,600,2023-01-14
1008,2023-01-08,2023-01-08,400,2023-01-08
1009,2023-01-09,2023-01-10,900,2023-01-16
1010,2023-01-10,2023-01-10,700,2023-01-10
'''

# A user whose last login is more than this many days old counts as churned.
CHURN_THRESHOLD_DAYS = 7

# Load the CSV text into a DataFrame.
df = pd.read_csv(StringIO(data))

# Parse the date columns so that date arithmetic below yields timedeltas.
for date_column in ('reg_date', 'order_date', 'last_login'):
    df[date_column] = pd.to_datetime(df[date_column])

print("原始数据:")
print(df)
print("\n数据基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\n数据描述性统计:")
print(df.describe())

# Days between registration and the last login. The column is named
# "active_days" but measures this span, not the number of days with activity.
df['active_days'] = (df['last_login'] - df['reg_date']).dt.days
print("\n添加活跃天数后的数据:")
print(df)

# Days between registration and the order date.
df['purchase_delay'] = (df['order_date'] - df['reg_date']).dt.days
print("\n添加购买延迟后的数据:")
print(df)

# Days since the last login, measured against a fixed reference date
# (the analysis assumes "today" is 2023-01-20).
current_date = pd.to_datetime('2023-01-20')
df['days_since_last_login'] = (current_date - df['last_login']).dt.days
print("\n添加最近登录天数后的数据:")
print(df)

# Churn label: inactive for more than CHURN_THRESHOLD_DAYS days.
df['is_churn'] = df['days_since_last_login'] > CHURN_THRESHOLD_DAYS
print("\n添加流失标签后的数据:")
print(df)

# Compare feature means and group sizes for churned vs. retained users.
print("\n流失用户与非流失用户对比:")
churn_analysis = df.groupby('is_churn').agg({
    'amount': 'mean',
    'active_days': 'mean',
    'purchase_delay': 'mean',
    'days_since_last_login': 'mean',
    'user_id': 'count'
}).rename(columns={'user_id': 'count'})
print(churn_analysis)

# Correlation of each feature with the churn label. Note that
# days_since_last_login correlates with is_churn by construction, because the
# label is derived from that very column.
print("\n特征与流失的相关性:")
correlation = df[['amount', 'active_days', 'purchase_delay', 'days_since_last_login', 'is_churn']].corr()
print(correlation['is_churn'])

# Heuristic risk score: longer inactivity and a longer purchase delay raise
# the score, a larger purchase amount lowers it. The weights are hand-picked.
df['churn_risk_score'] = df['days_since_last_login'] * 0.5 + df['purchase_delay'] * 0.3 - df['amount'] * 0.001
print("\n添加流失风险评分后的数据:")
print(df[['user_id', 'amount', 'active_days', 'purchase_delay', 'days_since_last_login', 'is_churn', 'churn_risk_score']])

# Rank users from highest to lowest risk.
print("\n按流失风险评分排序:")
print(df.sort_values('churn_risk_score', ascending=False)[['user_id', 'churn_risk_score', 'is_churn']])

# Purchase behaviour of churned vs. retained users.
print("\n流失用户的购买行为分析:")
# is_churn is already a boolean column, so it can be used as a mask directly.
churned_users = df[df['is_churn']]
non_churned_users = df[~df['is_churn']]
print(f"流失用户平均购买金额: {churned_users['amount'].mean():.2f}")
print(f"非流失用户平均购买金额: {non_churned_users['amount'].mean():.2f}")
print(f"流失用户平均购买延迟: {churned_users['purchase_delay'].mean():.2f} 天")
print(f"非流失用户平均购买延迟: {non_churned_users['purchase_delay'].mean():.2f} 天")
返回主页