Pandas 数据分箱(cut / qcut)
数据分箱(也称为分桶)是将连续变量离散化的过程,常用于数据预处理、特征工程和数据分析。
cut 等宽分箱
cut 按数值范围对数据分箱:传入整数时由 pandas 自动划分等宽区间,也可以传入列表自行指定区间边界。
实例
import pandas as pd
import numpy as np
# Sample ages used throughout this example.
ages = pd.Series([5, 15, 25, 35, 45, 55, 65, 75, 85])
print("原始数据:")
print(ages.tolist())
print()
# Binning with explicit edges: 0-100 in steps of 20 gives five equal-width
# intervals, each mapped to one label.
bins = [0, 20, 40, 60, 80, 100]
labels = ["儿童", "青年", "中年", "中老年", "老年"]
age_bins = pd.cut(ages, bins=bins, labels=labels)
print("等宽分箱结果:")
print(age_bins)
print()
# Automatic equal-width binning: an integer `bins` asks pandas to split the
# data's range into that many equal-width intervals.
age_bins2 = pd.cut(ages, bins=4)
print("自动等宽分箱:")
print(age_bins2)
import numpy as np
# Sample ages used throughout this example.
ages = pd.Series([5, 15, 25, 35, 45, 55, 65, 75, 85])
print("原始数据:")
print(ages.tolist())
print()
# Binning with explicit edges: 0-100 in steps of 20 gives five equal-width
# intervals, each mapped to one label.
bins = [0, 20, 40, 60, 80, 100]
labels = ["儿童", "青年", "中年", "中老年", "老年"]
age_bins = pd.cut(ages, bins=bins, labels=labels)
print("等宽分箱结果:")
print(age_bins)
print()
# Automatic equal-width binning: an integer `bins` asks pandas to split the
# data's range into that many equal-width intervals.
age_bins2 = pd.cut(ages, bins=4)
print("自动等宽分箱:")
print(age_bins2)
返回区间索引和边界
实例
import pandas as pd
import numpy as np
ages = pd.Series([5, 15, 25, 35, 45])
# labels=False returns the integer index of each value's bin instead of an
# interval label.
result = pd.cut(ages, bins=4, labels=False)
print("区间索引:")
print(result)
print()
# retbins=True makes cut return a tuple (binned values, bin edges);
# element [1] is the array of computed edges.
result = pd.cut(ages, bins=4, retbins=True)
print("区间边界:")
print(result[1])
import numpy as np
ages = pd.Series([5, 15, 25, 35, 45])
# labels=False returns the integer index of each value's bin instead of an
# interval label.
result = pd.cut(ages, bins=4, labels=False)
print("区间索引:")
print(result)
print()
# retbins=True makes cut return a tuple (binned values, bin edges);
# element [1] is the array of computed edges.
result = pd.cut(ages, bins=4, retbins=True)
print("区间边界:")
print(result[1])
qcut 等频分箱
qcut 按分位数划分数据,使每个区间包含大致相同数量的数据点。
实例
import pandas as pd
import numpy as np
# Heavily skewed data: most values are small, with a few large outliers.
data = pd.Series([1, 1, 1, 2, 3, 4, 5, 10, 20, 30, 50, 100])
print("原始数据:")
print(data.tolist())
print()
# Equal-width binning. pd.cut takes `bins`; `q` is a qcut-only keyword and
# passing it to cut raises TypeError. On skewed data most points fall into
# the first interval, so the counts are very uneven.
cut_result = pd.cut(data, bins=4)
print("等宽分箱:")
print(cut_result.value_counts())
print()
# Equal-frequency binning: q=4 cuts at the quartiles, so each interval
# holds roughly the same number of points.
qcut_result = pd.qcut(data, q=4)
print("等频分箱:")
print(qcut_result.value_counts())
import numpy as np
# Heavily skewed data: most values are small, with a few large outliers.
data = pd.Series([1, 1, 1, 2, 3, 4, 5, 10, 20, 30, 50, 100])
print("原始数据:")
print(data.tolist())
print()
# Equal-width binning. pd.cut takes `bins`; `q` is a qcut-only keyword and
# passing it to cut raises TypeError. On skewed data most points fall into
# the first interval, so the counts are very uneven.
cut_result = pd.cut(data, bins=4)
print("等宽分箱:")
print(cut_result.value_counts())
print()
# Equal-frequency binning: q=4 cuts at the quartiles, so each interval
# holds roughly the same number of points.
qcut_result = pd.qcut(data, q=4)
print("等频分箱:")
print(qcut_result.value_counts())
指定分位数
实例
import pandas as pd
# The integers 1..100, so quantile positions are easy to check by eye.
data = pd.Series(range(1, 101))
# q given as a list of quantile edges (from 0 to 1) instead of a bin count:
# the gaps between consecutive edges are 10%, 20%, 40%, 20% and 10% of the data.
result = pd.qcut(data, q=[0, 0.1, 0.3, 0.7, 0.9, 1])
print("按分位数划分:")
# sort_index() lists the bins in interval order rather than by count.
print(result.value_counts().sort_index())
# The integers 1..100, so quantile positions are easy to check by eye.
data = pd.Series(range(1, 101))
# q given as a list of quantile edges (from 0 to 1) instead of a bin count:
# the gaps between consecutive edges are 10%, 20%, 40%, 20% and 10% of the data.
result = pd.qcut(data, q=[0, 0.1, 0.3, 0.7, 0.9, 1])
print("按分位数划分:")
# sort_index() lists the bins in interval order rather than by count.
print(result.value_counts().sort_index())
实战:数据分析
实例
import pandas as pd
import numpy as np
# Simulate customer spending; the fixed seed makes the run reproducible.
np.random.seed(42)
customers = pd.DataFrame({
    "客户ID": range(1, 101),
    "消费金额": np.random.exponential(500, 100) + 100,
})
# Bin spending into tiers; the open-ended last edge (inf) catches every
# amount above 800.
customers["消费等级"] = pd.cut(
    customers["消费金额"],
    bins=[0, 300, 500, 800, float("inf")],
    labels=["低", "中", "高", "VIP"],
)
# Number of customers in each tier.
print("消费等级分布:")
print(customers["消费等级"].value_counts())
print()
# Average spend per tier. The grouping key is categorical, so `observed` is
# passed explicitly: relying on the implicit default is deprecated in recent
# pandas and emits a FutureWarning. observed=False keeps every tier in the
# result, even one with no customers.
print("各等级平均消费:")
print(customers.groupby("消费等级", observed=False)["消费金额"].mean().round(2))
import numpy as np
# Simulate customer spending; the fixed seed makes the run reproducible.
np.random.seed(42)
customers = pd.DataFrame({
    "客户ID": range(1, 101),
    "消费金额": np.random.exponential(500, 100) + 100,
})
# Bin spending into tiers; the open-ended last edge (inf) catches every
# amount above 800.
customers["消费等级"] = pd.cut(
    customers["消费金额"],
    bins=[0, 300, 500, 800, float("inf")],
    labels=["低", "中", "高", "VIP"],
)
# Number of customers in each tier.
print("消费等级分布:")
print(customers["消费等级"].value_counts())
print()
# Average spend per tier. The grouping key is categorical, so `observed` is
# passed explicitly: relying on the implicit default is deprecated in recent
# pandas and emits a FutureWarning. observed=False keeps every tier in the
# result, even one with no customers.
print("各等级平均消费:")
print(customers.groupby("消费等级", observed=False)["消费金额"].mean().round(2))
