学习如何合并多个数据表,并计算综合指标。
| order_id | user_id | product_id | price | quantity | order_date |
|---|---|---|---|---|---|
| 1 | 101 | 201 | 100.0 | 2 | 2023-01-01 |
| 2 | 102 | 202 | 50.0 | 1 | 2023-01-02 |
| 3 | 101 | 203 | 150.0 | 1 | 2023-01-03 |
| 4 | 103 | 201 | 100.0 | 3 | 2023-01-04 |
| 5 | 102 | 202 | 50.0 | 2 | 2023-01-05 |
import pandas as pd
import numpy as np
# 读取订单数据
order_data = '''order_id,user_id,product_id,price,quantity,order_date
1,101,201,100.0,2,2023-01-01
2,102,202,50.0,1,2023-01-02
3,101,203,150.0,1,2023-01-03
4,103,201,100.0,3,2023-01-04
5,102,202,50.0,2,2023-01-05
'''
# 读取用户数据
user_data = '''user_id,username,age,gender
101,Alice,25,Female
102,Bob,30,Male
103,Charlie,35,Male
'''
# 读取产品数据
product_data = '''product_id,product_name,category
201,iPhone,Electronics
202,MacBook,Electronics
203,iPad,Electronics
'''
# 转换为DataFrame
from io import StringIO
orders = pd.read_csv(StringIO(order_data))
users = pd.read_csv(StringIO(user_data))
products = pd.read_csv(StringIO(product_data))
print("订单数据:")
print(orders)
print("\n用户数据:")
print(users)
print("\n产品数据:")
print(products)
# 合并订单和用户数据
order_user = pd.merge(orders, users, on='user_id', how='left')
print("\n订单-用户合并数据:")
print(order_user)
# 合并订单-用户和产品数据
order_user_product = pd.merge(order_user, products, on='product_id', how='left')
print("\n订单-用户-产品合并数据:")
print(order_user_product)
# 计算总销售额
order_user_product['total_amount'] = order_user_product['price'] * order_user_product['quantity']
print("\n添加总销售额后的数据:")
print(order_user_product)
# 按用户分组计算综合指标
user_metrics = order_user_product.groupby('user_id').agg({
'order_id': 'count',
'quantity': 'sum',
'total_amount': 'sum',
'price': 'mean'
}).rename(columns={
'order_id': 'order_count',
'price': 'avg_price'
})
print("\n用户综合指标:")
print(user_metrics)
# 按产品分组计算综合指标
product_metrics = order_user_product.groupby('product_name').agg({
'order_id': 'count',
'quantity': 'sum',
'total_amount': 'sum'
}).rename(columns={
'order_id': 'order_count'
})
print("\n产品综合指标:")
print(product_metrics)
# 按日期分组计算销售额
order_user_product['order_date'] = pd.to_datetime(order_user_product['order_date'])
daily_sales = order_user_product.groupby('order_date')['total_amount'].sum()
print("\n每日销售额:")
print(daily_sales)
# 计算每个用户的平均订单金额
user_metrics['avg_order_amount'] = user_metrics['total_amount'] / user_metrics['order_count']
print("\n用户平均订单金额:")
print(user_metrics[['order_count', 'total_amount', 'avg_order_amount']])
# 按性别分组计算消费情况
gender_metrics = order_user_product.groupby('gender').agg({
'order_id': 'count',
'quantity': 'sum',
'total_amount': 'sum'
}).rename(columns={
'order_id': 'order_count'
})
print("\n性别消费统计:")
print(gender_metrics)
# 计算产品的平均销售价格
product_metrics['avg_selling_price'] = product_metrics['total_amount'] / product_metrics['quantity']
print("\n产品平均销售价格:")
print(product_metrics[['order_count', 'quantity', 'total_amount', 'avg_selling_price']])