我修改了数据框 h2_2020，在其中加入了客户 4，用来覆盖下半年出现新客户的情况。
# module
import pandas as pd
import numpy as np
# Build the sample purchase tables: one row per (customer, product) purchase
# for each half of 2020. Customer 4 appears only in the second half.
h1_2020 = pd.DataFrame({
    'customer': [1, 1, 2, 3],
    'product': ['A', 'B', 'A', 'A'],
})
print(h1_2020)
h2_2020 = pd.DataFrame({
    'customer': [1, 2, 3, 3, 4],
    'product': ['A', 'B', 'C', 'A', 'A'],
})
print(h2_2020)
因此，我的结果中多了一列，用来标示该客户是否为下半年（half 2）才出现的新客户。
# De-duplicate each half so every (customer, product) pair appears only once.
dh1_2020 = h1_2020.drop_duplicates()
dh2_2020 = h2_2020.drop_duplicates()
# Stack both halves and de-duplicate again to get every product each customer
# bought during the year. DataFrame.append was removed in pandas 2.0;
# pd.concat yields the same rows and index on old and new pandas alike.
a = (
    pd.concat([dh2_2020, dh1_2020])
    .drop_duplicates()
    .sort_values(['customer', 'product'])
)
print(a)
# Check whether customers bought a new product in half 2: with indicator=True
# the '_merge' column is 'both' for pairs already present in half 1 and
# 'left_only' for pairs that only show up in half 2.
check = a.merge(dh1_2020, on=['customer', 'product'], indicator=True, how='outer')
print(check)
# In `check`, 'left_only' marks a (customer, product) pair that is missing from
# half 1, i.e. a product first bought in half 2. Customer 4 is flagged as well,
# but only because that customer is entirely new in half 2.
#
# Build two dummy columns per customer:
#   _merge           -> 1 if the customer bought a new product in half 2
#   new_customer_hf2 -> 1 if the customer is a new customer in half 2
m1 = dh1_2020[['customer']].drop_duplicates()
print(m1)
bought_new_product = check['_merge'] == 'left_only'
m2 = check.loc[bought_new_product, ['customer', '_merge']].drop_duplicates()
print(m2)
# Outer-merge the half-1 customers with the "new product" customers; a
# customer found only on the right side never appeared in half 1.
whatwewant = pd.merge(
    m1,
    m2,
    on='customer',
    how='outer',
    indicator='new_customer_hf2',
)
whatwewant['_merge'] = (whatwewant['_merge'] == 'left_only').astype(int)
whatwewant['new_customer_hf2'] = (
    whatwewant['new_customer_hf2'] == 'right_only'
).astype(int)
print(whatwewant)
# Expected output:
#    customer  _merge  new_customer_hf2
# 0         1       0                 0
# 1         2       1                 0
# 2         3       1                 0
# 3         4       1                 1