项目1:电商订单数据基础清洗与探索

学习如何清洗和探索电商订单数据,包括去重、处理缺失值等基础操作。

数据预览

user_id order_id order_date price quantity category
100120012023-01-01100.02Electronics
100220022023-01-0250.01Clothing
100320032023-01-03200.01Electronics
100120042023-01-0475.03Home
100220052023-01-05120.02Electronics
100320062023-01-0630.05Clothing
100120072023-01-07150.01Home
100220082023-01-0860.02Clothing
100320092023-01-09180.01Electronics
100120102023-01-1090.02Home

代码编辑器

参考答案

import pandas as pd
import numpy as np

# 读取数据
data = '''user_id,order_id,order_date,price,quantity,category
1001,2001,2023-01-01,100.0,2,Electronics
1002,2002,2023-01-02,50.0,1,Clothing
1003,2003,2023-01-03,200.0,1,Electronics
1001,2004,2023-01-04,75.0,3,Home
1002,2005,2023-01-05,120.0,2,Electronics
1003,2006,2023-01-06,30.0,5,Clothing
1001,2007,2023-01-07,150.0,1,Home
1002,2008,2023-01-08,60.0,2,Clothing
1003,2009,2023-01-09,180.0,1,Electronics
1001,2010,2023-01-10,90.0,2,Home
'''

# 转换为DataFrame
from io import StringIO
df = pd.read_csv(StringIO(data))

print("原始数据:")
print(df)
print("\n数据基本信息:")
print(df.info())
print("\n数据描述性统计:")
print(df.describe())

# 检查重复值
print("\n重复值数量:", df.duplicated().sum())

# 检查缺失值
print("\n缺失值情况:")
print(df.isnull().sum())

# 计算总销售额
df['total_amount'] = df['price'] * df['quantity']
print("\n添加总销售额后的数据集:")
print(df)

# 按类别分组统计
print("\n按类别统计:")
print(df.groupby('category').agg({
    'order_id': 'count',
    'quantity': 'sum',
    'total_amount': 'sum'
}))

# 按用户分组统计
print("\n按用户统计:")
print(df.groupby('user_id').agg({
    'order_id': 'count',
    'quantity': 'sum',
    'total_amount': 'sum'
}))

# 按日期排序
df_sorted = df.sort_values('order_date')
print("\n按日期排序后的数据:")
print(df_sorted)

# 计算每日销售额
daily_sales = df.groupby('order_date')['total_amount'].sum()
print("\n每日销售额:")
print(daily_sales)
返回主页