学习如何清洗和探索电商订单数据,包括去重、处理缺失值等基础操作。
| user_id | order_id | order_date | price | quantity | category |
|---|---|---|---|---|---|
| 1001 | 2001 | 2023-01-01 | 100.0 | 2 | Electronics |
| 1002 | 2002 | 2023-01-02 | 50.0 | 1 | Clothing |
| 1003 | 2003 | 2023-01-03 | 200.0 | 1 | Electronics |
| 1001 | 2004 | 2023-01-04 | 75.0 | 3 | Home |
| 1002 | 2005 | 2023-01-05 | 120.0 | 2 | Electronics |
| 1003 | 2006 | 2023-01-06 | 30.0 | 5 | Clothing |
| 1001 | 2007 | 2023-01-07 | 150.0 | 1 | Home |
| 1002 | 2008 | 2023-01-08 | 60.0 | 2 | Clothing |
| 1003 | 2009 | 2023-01-09 | 180.0 | 1 | Electronics |
| 1001 | 2010 | 2023-01-10 | 90.0 | 2 | Home |
import pandas as pd
import numpy as np
# 读取数据
data = '''user_id,order_id,order_date,price,quantity,category
1001,2001,2023-01-01,100.0,2,Electronics
1002,2002,2023-01-02,50.0,1,Clothing
1003,2003,2023-01-03,200.0,1,Electronics
1001,2004,2023-01-04,75.0,3,Home
1002,2005,2023-01-05,120.0,2,Electronics
1003,2006,2023-01-06,30.0,5,Clothing
1001,2007,2023-01-07,150.0,1,Home
1002,2008,2023-01-08,60.0,2,Clothing
1003,2009,2023-01-09,180.0,1,Electronics
1001,2010,2023-01-10,90.0,2,Home
'''
# 转换为DataFrame
from io import StringIO
df = pd.read_csv(StringIO(data))
print("原始数据:")
print(df)
print("\n数据基本信息:")
print(df.info())
print("\n数据描述性统计:")
print(df.describe())
# 检查重复值
print("\n重复值数量:", df.duplicated().sum())
# 检查缺失值
print("\n缺失值情况:")
print(df.isnull().sum())
# 计算总销售额
df['total_amount'] = df['price'] * df['quantity']
print("\n添加总销售额后的数据集:")
print(df)
# 按类别分组统计
print("\n按类别统计:")
print(df.groupby('category').agg({
'order_id': 'count',
'quantity': 'sum',
'total_amount': 'sum'
}))
# 按用户分组统计
print("\n按用户统计:")
print(df.groupby('user_id').agg({
'order_id': 'count',
'quantity': 'sum',
'total_amount': 'sum'
}))
# 按日期排序
df_sorted = df.sort_values('order_date')
print("\n按日期排序后的数据:")
print(df_sorted)
# 计算每日销售额
daily_sales = df.groupby('order_date')['total_amount'].sum()
print("\n每日销售额:")
print(daily_sales)