import pandas as pd
import numpy as np
df = pd.DataFrame({'id':[1001,1002,1003,1004,1005,1006],
'date':pd.date_range('20130102', periods=6),
'city':['Beijing','SH','Guangzhou ','Shenzhen','Shanghai', 'Beijing'],
'age':[23,44,54,32,34,32],
'category':['100-A','100-B','110-A','110-C','210-A','130-F'],
'price':[1200,np.nan,2133,5433,np.nan,4432]},
columns =['id','date','city','category','age','price'])
# 对所有列进行计数汇总
df.groupby('city').count()
# 对特定的ID列进行计数汇总
df.groupby('city')['id'].count()
# 对两个字段进行汇总计数
df.groupby(['city','gender'])['id'].count()
# 对city字段进行汇总并计算price的合计和均值
df.groupby('city')['price'].agg([len,np.sum, np.mean])
# 对city字段进行汇总并计算price均值和age最大值
df.groupby('city').agg(
{'price': np.mean, 'age': np.max})
# 对金额累计求和
df['amount'].cumsum()
# 分组累计求和
df['sum']=df.groupby(['user_id'])['cnt'].cumsum()
# 简单的数据采样
df.sample(n=3)
df.sample(100, random_state=10)
# 手动设置采样权重
weights = [0, 0, 0, 0, 0.5, 0.5]
df.sample(n=2, weights=weights)
# 采样后不放回
df.sample(n=6, replace=False)
# 采样后放回
df.sample(n=6, replace=True)
# 打乱数据顺序
from sklearn.utils import shuffle
df = shuffle(df, random_state=10).reset_index(drop=True)
# 下采样
from imblearn.under_sampling import RandomUnderSampler
RUS = RandomUnderSampler(random_state=10, ratio=0.3)
X_RUS, y_RUS = RUS.fit_sample(X_train, y_train)
X_RUS = pd.DataFrame(X_RUS, columns = list(X_train.columns))
y_RUS = pd.DataFrame(y_RUS, columns= {'y'})
y_RUS['y'].value_counts()
df_rus = y_RUS.join(X_RUS)
# 数据表描述性统计
df.describe().round(2).T
# 标准差
df['price'].std()
# 协方差
df['price'].cov(df['m-point'])
df.cov()
# 相关性分析
df['price'].corr(df['m-point'])
df.corr()
# 数据标准化
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
std.fit(df)
X_std = std.transform(df)
# 通过标准差找出异常值:平均值上下1.96个标准差区间以外的值
meangrade = df['a'].mean()
stdgrade = df['a'].std()
toprange = meangrade + stdgrade * 1.96
botrange = meangrade - stdgrade * 1.96
copydf = df
copydf = copydf.drop(copydf[copydf['a']> toprange].index)
copydf = copydf.drop(copydf[copydf['a']< botrange].index)
# 数据透视表
pd.pivot_table(df_inner,index=['city'],values=['price'],columns=['gender'], \
aggfunc=[len,np.sum], fill_value=0, margins=True)
# 文本变量取最后一个值
pd.pivot_table(df_inner, index=['gender'], values=['city'], aggfunc='last')
import datetime
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6,
'B': ['A', 'B', 'C'] * 8,
'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
'D': np.random.randn(24),
'E': np.random.randn(24),
'F': [datetime.datetime(2013, i, 1) for i in range(1, 13)] +
[datetime.datetime(2013, i, 15) for i in range(1, 13)]})
pd.pivot_table(df, index=['A', 'B'], columns=['C'], values='D', aggfunc=np.sum)
pd.pivot_table(df, index=['C'], columns=['A', 'B'], values='D', aggfunc='sum')
pd.pivot_table(df, index=['A', 'B'], columns=['C'], values=['D','E'], aggfunc=np.sum)
pd.pivot_table(df, index=['A', 'B'], columns=['C'], values=['D','E'], aggfunc=[np.sum])
pd.pivot_table(df, index=['A', 'B'], columns=['C'], values=['D','E'], aggfunc={'D':len,'E':np.sum})
pd.pivot_table(df, index=['A', 'B'], columns=['C'], values=['D','E'], aggfunc={'D':len,'E':[np.sum, np.mean]})
pd.pivot_table(df, index=pd.Grouper(freq='M', key='F'), columns='C', values='D', aggfunc=np.sum) #类似 resample
# 数据透视表复杂操作
pv_x = pd.pivot_table(df, index=['商户','状态'],
values=['金额'],
columns = ['月份'],
aggfunc=[np.sum, len],
fill_value=0, margins=True)
pv_x.columns = pv_x.columns.droplevel(level=[1])
pv_x = pv_x.rename(columns={'sum':'金额','len':'件数'}, level=0)
pv_x = pv_x.T.sort_index(level=[1,0],ascending=[1,0]).T #按列名排序
pv_x.rename({'All':'合计'}, axis=1, inplace=True)
pv_x.rename({'All':'合计'}, level=0, inplace=True)
pv_x.rename({'':'合计'}, level=1, inplace=True)
pv_x.reindex(index=[('商户A','状态1'),
('商户A','状态2'),
('商户A','状态3'),
('商户B','状态1'),
('商户B','状态2'),
('商户B','状态3')]).fillna(0) #手动指定多层级排序
pv_x1 = pv_x.iloc[:-1,].reindex(index=['状态3','状态2','状态1','合计'],level=1) #按行排序
pv_x2 = pv_x.iloc[-1:,]
pv_x = pd.concat([pv_x1, pv_x2], axis=0)
pv_x = pv_x.swaplevel(axis=1)
# 用 concat上下拼接
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
df_concat = pd.concat([df1, df2, df3])
# 用join根据index拼接
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
df_join = left.join(right)
# 用merge根据key左右拼接
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
df_merge = pd.merge(left, right, on=['key1', 'key2'])
| < <目录 | 07-常用函数> | 返回顶部 ↑ |