【发布时间】:2021-07-26 00:15:35
【问题描述】:
我有类似于这个的数据框。
import random
import string

import numpy as np
import pandas as pd
def generate_example_dataframe() -> pd.DataFrame:
    """Build a reproducible example DataFrame in long format.

    Every combination of subject, region, condition and group (a "cell")
    contributes ``n_samples`` rows.  Each value is the cell's nominal mean
    plus uniform noise in [0, 1) drawn per observation, plus one offset of
    ``0.2 * U[0, 1)`` shared by all observations of that cell.

    Side effect: the global ``random`` module is re-seeded with 1.  The
    per-observation noise comes from a local, seeded NumPy generator, so
    NumPy's global random state is left untouched and repeated calls return
    identical frames.

    Returns:
        A DataFrame with columns ``region``, ``group``, ``condition``,
        ``subject`` and ``values``.  The index restarts at 0 for every cell
        (0 .. n_samples - 1, repeated).
    """
    n_regions = 20  # number of regions used in the simulation
    n_samples = 20  # observations drawn for every cell
    subjects_num = 10

    random.seed(1)
    rng = np.random.default_rng(1)

    conditions = ["open", "closed"]
    groups = ["old", "young"]
    # One nominal mean per (condition, group) pair, in nested-loop order:
    # open/old, open/young, closed/old, closed/young.
    means = [1, 1.5, 1.25, 1.75]
    cell_means = list(
        zip([(c, g) for c in conditions for g in groups], means)
    )

    regions = [f"region_{s}" for s in string.ascii_letters[:n_regions]]
    # range() excludes its stop value, hence the +1 to get subjects_num subjects.
    subjects = [f"subject_{s}" for s in range(1, subjects_num + 1)]

    # Accumulate plain columns and build a single DataFrame at the end
    # instead of concatenating hundreds of tiny frames.
    columns = {
        "region": [],
        "group": [],
        "condition": [],
        "subject": [],
        "values": [],
    }
    n_cells = 0
    for subject in subjects:
        for region in regions:
            for (condition, group), mean in cell_means:
                offset = 0.2 * random.random()  # shared by the whole cell
                columns["region"].extend([region] * n_samples)
                columns["group"].extend([group] * n_samples)
                columns["condition"].extend([condition] * n_samples)
                columns["subject"].extend([subject] * n_samples)
                columns["values"].extend(mean + rng.random(n_samples) + offset)
                n_cells += 1

    # Reproduce the index that concatenating per-cell frames would give:
    # 0 .. n_samples - 1 repeated once per cell.
    index = np.tile(np.arange(n_samples), n_cells)
    return pd.DataFrame(columns, index=index)
# %% [markdown]
# Generating a sample dataframe, presented in the long format - presumably one observation per row (original note was truncated)
# %%
df = generate_example_dataframe()
# Copy the first 10 rows, including the index, to the system clipboard as comma-separated text.
df.head(10).to_clipboard(sep=',', index=True)
输出如下:
,region,group,condition,subject,values
0,region_a,old,open,subject_1,1.4914914311214753
1,region_a,old,open,subject_1,1.9742822483723783
2,region_a,old,open,subject_1,1.0461147549953116
3,region_a,old,open,subject_1,1.9369465073938947
4,region_a,old,open,subject_1,1.817792271839675
5,region_a,old,open,subject_1,1.4272522367426221
6,region_a,old,open,subject_1,1.129423554333859
7,region_a,old,open,subject_1,1.9021298911486018
8,region_a,old,open,subject_1,1.950500304961099
9,region_a,old,open,subject_1,1.6832358513116206
我想对按区域、组和条件划分的值进行简单的 t 检验(检验次数 = 区域数 × 组数 × 条件数)。最 Pythonic 的方法是什么?我目前能想到的唯一办法是循环遍历这些变量的取值,并从大数据框中逐个取出对应的子集。
【问题讨论】:
-
感谢您提供重现 DataFrame 的方法!我是否可以认为每个观测值都是相互独立的?
标签: python pandas pandas-groupby statistical-test