Scipy 和 Sklearn Yeo-Johnson 归一化结果不匹配答案

【问题标题】：Scipy and Sklearn Yeo-Johnson normalization results do not match（Scipy 和 Sklearn Yeo-Johnson 归一化结果不匹配）
【发布时间】：2021-09-21 07:26:34
【问题描述】：

我正在运行 Yeo-Johnson 变换，并按照 Scipy 网站（Scipy link）上给出的示例进行操作。我还将它与 Sklearn 的实现进行了比较。这是代码：

    import seaborn as sns
    from sklearn.preprocessing import PowerTransformer
    from scipy import stats
    import matplotlib.pyplot as plt
    import numpy as np

    # Compare scipy's and sklearn's Yeo-Johnson transforms on the same sample,
    # showing a normal probability plot (left column) and a distribution plot
    # (right column) for each. Panels sit on a 4-row x 2-column grid; only the
    # first six slots (421..426) are used.
    fig = plt.figure( figsize=(10,10))
    ax1 = fig.add_subplot(421)
    # 500 draws from a log-gamma distribution (shape argument 5), shifted by +5.
    # No seed is set, so the sample differs on every run.
    x = stats.loggamma.rvs(5, size=500) + 5
    # Row 1, left: probability plot of the raw data against a normal distribution.
    prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    ax1.set_xlabel('')
    ax1.set_title('Probplot')

    # Row 1, right: distribution of the raw data.
    ax2 = fig.add_subplot(422)
    # distplot is called without ax=, so it draws on pyplot's current axes --
    # presumably the subplot created on the previous line; statement order matters.
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm
    # the installed version still provides it.
    sns.distplot(x, color="skyblue")
    ax2.set_title('Distribution of Data')

    # Row 2: scipy's Yeo-Johnson. Called without a lambda, it returns both the
    # transformed data and the lambda it fitted.
    ax3 = fig.add_subplot(423)
    xt_scipy, lmbda = stats.yeojohnson(x)
    prob = stats.probplot(xt_scipy, dist=stats.norm, plot=ax3)
    ax3.set_xlabel('')
    ax3.set_title('Probplot:Yeo-Johnson:Scipy')

    ax4 = fig.add_subplot(424)
    sns.distplot(xt_scipy, color="skyblue")
    ax4.set_title('Distribution of Transformed Data')

    # Row 3: sklearn's Yeo-Johnson. standardize = True additionally rescales the
    # transformed output, so its value range is not expected to match xt_scipy.
    ax5 = fig.add_subplot(425)
    pt = PowerTransformer(method = 'yeo-johnson',standardize = True)
    # reshape(-1,1) turns the 1-D sample into a single-column 2-D array for sklearn.
    xt_sklearn = pt.fit_transform(x.reshape(-1,1))
    # flatten() converts the (n, 1) result back to 1-D for probplot.
    prob = stats.probplot(xt_sklearn.flatten(), dist=stats.norm, plot=ax5)
    ax5.set_xlabel('')
    ax5.set_title('Probplot:Yeo-Johnson:Sklearn')

    ax6 = fig.add_subplot(426)
    sns.distplot(xt_sklearn, color="skyblue")
    ax6.set_title('Distribution of Transformed Data')
    # Add padding between panels so titles and tick labels do not overlap.
    plt.tight_layout(h_pad=0.9, w_pad=0.9)
    plt.show()

查看附图，从分位数图（probplot）可以看出，两种方法似乎都按预期使数据接近正态分布。
但是，两个库的转换数据分布图虽然形状相同，但值范围不同。为什么转换后的值不同？哪一个对应于真正的 Yeo Johnson 公式？赛迪

【问题讨论】：

标签： python scipy normalization

【解决方案1】：

这是我的疏忽。我没有意识到 Sklearn 的 PowerTransformer 在默认情况下（standardize=True）会在幂变换之后再进行标准化缩放。下面是修改后的代码，可以得到相互匹配的结果。

    import seaborn as sns
    import sklearn.preprocessing
    from sklearn.preprocessing import PowerTransformer, StandardScaler
    from scipy import stats
    import matplotlib.pyplot as plt
    import numpy as np

    # Show that scipy and sklearn Yeo-Johnson results agree once standardization
    # is treated consistently: scipy alone vs. sklearn with standardize = False,
    # and scipy + StandardScaler vs. sklearn with standardize=True.
    # Panels sit on a 4x4 grid; slots 1-2, 5-8 and 9-12 are used.
    ss = StandardScaler()
    fig = plt.figure( figsize=(10,10))
    ax1 = fig.add_subplot(441)
    # 500 draws from a log-gamma distribution (shape argument 5), shifted by +5.
    # No seed is set, so the sample differs on every run.
    x = stats.loggamma.rvs(5, size=500) + 5
    # Row 1: probability plot and distribution of the raw data.
    prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    ax1.set_xlabel('')
    ax1.set_title('Probplot')

    ax2 = fig.add_subplot(442)
    # distplot is called without ax=, so it draws on pyplot's current axes --
    # presumably the subplot created on the previous line; statement order matters.
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm
    # the installed version still provides it.
    sns.distplot(x, color="skyblue")
    ax2.set_title('Distribution of Data')

    # Row 2, columns 1-2: scipy Yeo-Johnson with no extra scaling.
    ax5 = fig.add_subplot(445)
    xt_scipy, lmbda = stats.yeojohnson(x)
    prob = stats.probplot(xt_scipy, dist=stats.norm, plot=ax5)
    ax5.set_xlabel('')
    ax5.set_title('Probplot:Yeo-Johnson:Scipy')

    ax6 = fig.add_subplot(446)
    sns.distplot(xt_scipy, color="skyblue")
    ax6.set_title('Distribution of Transformed Data')

    # Row 2, columns 3-4: scipy Yeo-Johnson followed by StandardScaler.
    ax7 = fig.add_subplot(447)
    # Repeats the stats.yeojohnson(x) call made above on the same x (lmbda is rebound).
    xt_scipy_ss, lmbda = stats.yeojohnson(x)
    # reshape(-1, 1) gives the single-column 2-D array passed to the scaler.
    xt_scipy_ss = ss.fit_transform(xt_scipy_ss.reshape(-1, 1))
    # flatten() converts the (n, 1) result back to 1-D for probplot.
    prob = stats.probplot(xt_scipy_ss.flatten(), dist=stats.norm, plot=ax7)
    ax7.set_xlabel('')
    ax7.set_title('Probplot:Yeo-Johnson + Stand Scal:Scipy')

    ax8 = fig.add_subplot(448)
    sns.distplot(xt_scipy_ss, color="skyblue")
    ax8.set_title('Distribution of Transformed Data')

    # Row 3, columns 1-2: sklearn Yeo-Johnson with standardization turned off;
    # this is the panel pair meant to be compared with the plain scipy result.
    ax9 = fig.add_subplot(449)
    pt = PowerTransformer(method = 'yeo-johnson',standardize = False)
    xt_sklearn = pt.fit_transform(x.reshape(-1,1))
    prob = stats.probplot(xt_sklearn.flatten(), dist=stats.norm, plot=ax9)
    ax9.set_xlabel('')
    ax9.set_title('Probplot:Yeo-Johnson:Sklearn')

    # Slot numbers >= 10 need the three-argument form; the 3-digit shorthand
    # only covers single-digit indices.
    ax10 = fig.add_subplot(4,4,10)
    sns.distplot(xt_sklearn, color="skyblue")
    ax10.set_title('Distribution of Transformed Data')

    # Row 3, columns 3-4: sklearn Yeo-Johnson with standardization on; this is
    # the panel pair meant to be compared with scipy + StandardScaler.
    ax11 = fig.add_subplot(4,4,11)
    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    xt_sklearn_ss = pt.fit_transform(x.reshape(-1, 1))
    prob = stats.probplot(xt_sklearn_ss.flatten(), dist=stats.norm, plot=ax11)
    ax11.set_xlabel('')
    ax11.set_title('Probplot:Yeo-Johnson:Sklearn with Stand Scal')

    ax12 = fig.add_subplot(4, 4, 12)
    sns.distplot(xt_sklearn_ss, color="skyblue")
    ax12.set_title('Distribution of Transformed Data')
    # Add padding between panels so titles and tick labels do not overlap.
    plt.tight_layout(h_pad=0.9, w_pad=0.9)
    plt.show()

【讨论】：