这应该可以解决问题:
import pandas as pd
# Short aliases for the two pandas container types.
DF = pd.DataFrame
S = pd.Series
def gt_60_mins_taken(df: DF) -> S:
    """Return a boolean mask of the rows whose "Time Taken" exceeds 60 minutes.

    Args:
        df: Frame with a timedelta-typed "Time Taken" column.

    Returns:
        Boolean Series aligned with ``df.index``; True where the duration is
        strictly greater than 60 minutes.

    Raises:
        TypeError: If the "Time Taken" column does not have a timedelta dtype.
    """
    time_taken = df["Time Taken"]
    # Raise explicitly rather than ``assert``: asserts are stripped under
    # ``python -O``, and comparing the dtype's string form to
    # "timedelta64[ns]" would reject other timedelta resolutions.
    if not pd.api.types.is_timedelta64_dtype(time_taken.dtype):
        raise TypeError(
            f'"Time Taken" must be a timedelta column, got dtype {time_taken.dtype}'
        )
    mins_taken = time_taken.dt.total_seconds() / 60
    return mins_taken > 60
def handle_single_user(dfg: DF) -> S:
    """Summarise one group of rows into unique-issue statistics.

    Args:
        dfg: Rows to summarise; must contain "Issue ID" and "Time Taken"
            columns.

    Returns:
        Series with three entries: ``n_uniq_issues``,
        ``n_uniq_issues_where_took_gt_60mins`` and
        ``frac_uniq_issues_took_gt_60mins``. The fraction is NaN when the
        group contains no countable issue IDs.
    """
    n_uniq_issues = dfg["Issue ID"].nunique()
    took_gt_60_mins = gt_60_mins_taken(dfg)
    n_uniq_issues_where_took_gt_60mins = dfg.loc[
        took_gt_60_mins, "Issue ID"
    ].nunique()
    # A group with no countable issue IDs would otherwise raise
    # ZeroDivisionError; report the fraction as undefined instead.
    if n_uniq_issues:
        frac_uniq_issues_took_gt_60mins = (
            n_uniq_issues_where_took_gt_60mins / n_uniq_issues
        )
    else:
        frac_uniq_issues_took_gt_60mins = float("nan")
    return S(
        {
            "n_uniq_issues": n_uniq_issues,
            "n_uniq_issues_where_took_gt_60mins": n_uniq_issues_where_took_gt_60mins,
            "frac_uniq_issues_took_gt_60mins": frac_uniq_issues_took_gt_60mins,
        }
    )
# Small hand-made fixture: six rows spread across three users.
sample_data = {
    "User ID": [0, 0, 1, 2, 2, 2],
    "Issue ID": [100, 101, 103, 101, 100, 100],
    "Time Taken": pd.to_timedelta(
        [
            "30 seconds",
            "70 minutes",
            "10 hours",
            "5 seconds",
            "8 minutes",
            "100 minutes",
        ]
    ),
}
# Pin every column's dtype explicitly so the fixture does not depend on
# whatever the DataFrame constructor happens to infer.
sample_dtypes = {
    "User ID": "int64",
    "Issue ID": "int64",
    "Time Taken": "timedelta64[ns]",
}
sample_df = DF(sample_data).astype(sample_dtypes)
# Aggregate per user. The value columns are selected explicitly so the
# callback never receives the grouping column: pandas 2.2 deprecates apply()
# operating on grouping columns, and an explicit selection yields the same
# result on both older and newer versions.
output_df = (
    sample_df.groupby("User ID")[["Issue ID", "Time Taken"]]
    .apply(handle_single_user)
    .reset_index()
    # Each per-user Series mixes integer counts with a float fraction, so
    # restore the intended dtype of every column.
    .astype(
        {
            "User ID": "int64",
            "n_uniq_issues": "int64",
            "n_uniq_issues_where_took_gt_60mins": "int64",
            "frac_uniq_issues_took_gt_60mins": "float64",
        }
    )
)
# Hand-computed expectation for the sample data, written one row per user.
expected_output_df = DF(
    [
        (0, 2, 1, 0.5),
        (1, 1, 1, 1.0),
        (2, 2, 1, 0.5),
    ],
    columns=[
        "User ID",
        "n_uniq_issues",
        "n_uniq_issues_where_took_gt_60mins",
        "frac_uniq_issues_took_gt_60mins",
    ],
)
# Fail loudly if the computed summary drifts from the expectation before
# anything is written to disk.
pd.testing.assert_frame_equal(output_df, expected_output_df)
output_df.to_excel("/path/to/doc.xlsx")