使用关联规则分析购物篮数据,发现商品之间的关联关系。
| order_id | product_id | product_name | quantity | price |
|---|---|---|---|---|
| 1 | 101 | 牛奶 | 2 | 50.0 |
| 1 | 102 | 面包 | 1 | 30.0 |
| 1 | 103 | 鸡蛋 | 1 | 20.0 |
| 2 | 101 | 牛奶 | 1 | 50.0 |
| 2 | 102 | 面包 | 2 | 30.0 |
| 3 | 103 | 鸡蛋 | 2 | 20.0 |
| 3 | 104 | 黄油 | 1 | 40.0 |
| 4 | 101 | 牛奶 | 1 | 50.0 |
| 4 | 103 | 鸡蛋 | 1 | 20.0 |
| 4 | 104 | 黄油 | 1 | 40.0 |
| 5 | 102 | 面包 | 1 | 30.0 |
| 5 | 103 | 鸡蛋 | 1 | 20.0 |
| 6 | 101 | 牛奶 | 1 | 50.0 |
| 6 | 102 | 面包 | 1 | 30.0 |
| 6 | 103 | 鸡蛋 | 1 | 20.0 |
| 6 | 104 | 黄油 | 1 | 40.0 |
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
# Inline CSV sample: one row per line item, with the columns
# order_id, product_id, product_name, quantity and price.
from io import StringIO

data = """order_id,product_id,product_name,quantity,price
1,101,牛奶,2,50.0
1,102,面包,1,30.0
1,103,鸡蛋,1,20.0
2,101,牛奶,1,50.0
2,102,面包,2,30.0
3,103,鸡蛋,2,20.0
3,104,黄油,1,40.0
4,101,牛奶,1,50.0
4,103,鸡蛋,1,20.0
4,104,黄油,1,40.0
5,102,面包,1,30.0
5,103,鸡蛋,1,20.0
6,101,牛奶,1,50.0
6,102,面包,1,30.0
6,103,鸡蛋,1,20.0
6,104,黄油,1,40.0
"""

# Parse the CSV text through an in-memory buffer and echo the resulting table.
df = pd.read_csv(StringIO(data))
print("原始数据:", df, sep="\n")
# Preprocess into basket format: one row per order_id, one column per
# product_name, each cell holding the summed quantity for that pair.
# unstack(fill_value=0) writes 0 for products missing from an order, so the
# reset_index / fillna / set_index round-trip is unnecessary.
basket = df.groupby(['order_id', 'product_name'])['quantity'].sum().unstack(fill_value=0)
# Encode "was the product bought" as booleans with a vectorised comparison.
# This replaces the element-wise applymap (deprecated in recent pandas) and
# yields the bool dtype that mlxtend's apriori asks for.
basket_encoded = basket > 0
print("\n购物篮编码数据:")
# Render as 0/1 so the printed table keeps its 0-1 encoded appearance.
print(basket_encoded.astype(int))
# Mine frequent itemsets with the Apriori algorithm: keep itemsets whose
# support is at least 0.3 and label items by the basket's column names.
frequent_itemsets = apriori(
    basket_encoded,
    use_colnames=True,
    min_support=0.3,
)
print("\n频繁项集:", frequent_itemsets, sep="\n")
# Generate association rules from the frequent itemsets, using lift as the
# evaluation metric with a minimum threshold of 1.
rules = association_rules(frequent_itemsets, min_threshold=1, metric="lift")
print(
    "\n关联规则:",
    rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']],
    sep="\n",
)
# Rank the rules by lift, strongest first, and show at most the first ten.
rules_sorted = rules.sort_values(by='lift', ascending=False)
print(
    "\n按Lift值排序的关联规则:",
    rules_sorted.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']].iloc[:10],
    sep="\n",
)
# Frequent itemsets ordered by support, highest first (at most ten rows).
print(
    "\n最常见的商品组合:",
    frequent_itemsets.sort_values(by='support', ascending=False).iloc[:10],
    sep="\n",
)
# Itemsets whose support is strictly greater than 0.5.
print(
    "\n支持度大于0.5的项集:",
    frequent_itemsets.loc[frequent_itemsets['support'].gt(0.5)],
    sep="\n",
)
# Rules whose confidence is strictly greater than 0.7, reduced to the
# antecedents / consequents / confidence columns.
print(
    "\n置信度大于0.7的规则:",
    rules.loc[rules['confidence'].gt(0.7), ['antecedents', 'consequents', 'confidence']],
    sep="\n",
)