【问题标题】:Merge two files and add computation and sorting the updated data in python合并两个文件并在python中添加计算和排序更新的数据
【发布时间】:2022-01-12 23:10:45
【问题描述】:

我需要帮助来制作下面的 sn-p。我需要合并两个文件并在匹配的行上执行计算

我有包含旧数据的 oldFile.txt 和包含更新数据集的 newFile.txt

我需要根据 newFile.txt 中的数据更新 oldFile.txt 并计算百分比变化。任何想法都会非常有帮助。提前致谢

from collections import defaultdict
num = 0
data=defaultdict(int)
with open("newFile.txt", encoding='utf8', errors='ignore') as f:
    for line in f:
        grp, pname, cnt, cat = line.split(maxsplit=3)
        data[(pname.strip(),cat.replace('\n','').strip(),grp,cat)]+=int(cnt)
        
sorteddata = sorted([[k[0],v,k[1],k[2]] for k,v in data.items()], key=lambda x:x[1], reverse=True)

for subl in sorteddata[:10]:
    num += 1
    line = " ".join(map(str, subl))
    print ("{:>5} -> {:>}".format(str(num), line))

    with open("oldFile.txt", 'a', encoding='utf8', errors='ignore') as l:
        l.write(" ".join(map(str, subl)) + '\n')

oldFile.txt

 #col1             #col2        #col3  #col4
 1,396 c15e89f2149bcc0cbd5fb204   4    HUH_Token (HUH)                      
   279 9e4d81c8fc15870b15aef8dc   3    BABY BNB (BBNB)                
   231 31b5c07636dab8f0909dbd2d   6    Buff Unicorn (BUFFUN...)             
   438 1c6bc8e962427deb4106ae06   8    Charge (Charge)                      
 2,739 6ea059a29eccecee4e250414   2    MAXIMACASH (MAXCAS...)

newFile.txt #-- 更新数据,增加 oldFile.txt 中没有的行

 #col1             #col2        #col3  #col4
 8,739 6ea059a29eccecee4e250414   60   MAXIMACASH (MAXCAS...)
   138 1c6bc8e962427deb4106ae06   50   Charge (Charge)                      
   860 31b5c07636dab8f0909dbd2d   40   Buff Unicorn (BUFFUN...)             
   200 9e4d81c8fc15870b15aef8dc   30   BABY BNB (BBNB)    #-- not found in the oldFile.txt
    20 5esdsds2sd15870b15aef8dc   30   CharliesAngel (CA)            
 1,560 c15e89f2149bcc0cbd5fb204   20   HUH_Token (HUH)     

需要改进:#-- 使用附加列 (col5, col6) 并根据 (col3) 值排序

 #col1             #col2        #col3      #col4                #col5 (oldFile-newFile)   #col6 (oldFile-newFile)
 8,739 6ea059a29eccecee4e250414  62   MAXIMACASH (MAXCAS...)   2900.00 % (col3 2-60)    219.06 % (col1 2,739-8,739) 
   138 1c6bc8e962427deb4106ae06  58   Charge (Charge)           625.00 % (col3 8-50)    -68.49 % (col1   438-138)      
   860 31b5c07636dab8f0909dbd2d  46   Buff Unicorn (BUFFUN...)  566.67 % (col3 6-40)    272.29 % (col1   231-860)
   200 9e4d81c8fc15870b15aef8dc  33   BABY BNB (BBNB)           900.00 % (col3 3-30)    -28.32 % (col1   279-200) 
    20 5esdsds2sd15870b15aef8dc  30   CharliesAngel (CA)          0.00 % (col3 0-30)     20.00 % (col1   0-20) 
 1,560 c15e89f2149bcc0cbd5fb204  24   HUH_Token (HUH)           400.00 % (col3 4-20)     11.75 % (col1 1,396-1,560)

【问题讨论】:

  • 你做了什么来尝试做出这些改变?
  • 如果您只读取文件,我建议您在调用open 时添加'r',如果您只是写入文件,请添加'w'。这将保护您免受错误的影响,例如意外损坏您想要阅读的文件;这将使您的代码更容易理解,因为无论您是在阅读还是在写作,都会立即显而易见。
  • 我认为你应该把阅读和写作完全抛开。除非您在这些方面遇到问题,否则请将您的问题简化为核心问题 - 获取两个多维数组并将其转换为单个多维数组。文件读/写很简单。根据您的实际数据转换,您尝试将两者合并吗?
  • @rbutrnz 所以你以后不要在社交上失礼......“IT”通常不会指代与“程序员”相同的技能。
  • 虽然如果你的同事知道你是一个“程序员”,你肯定会成为事实上的“IT人”。

标签: python sorting


【解决方案1】:

这是一个示例代码,用于输出您需要的内容。 我使用下面的公式来计算 pct 变化。 percentage_change = 100*(new-old)/old

如果 old 为 0 则改为 1 以避免除以零错误。

import pandas as pd


def read_file(fn):
    """
    Read file fn and convert data into a dict of dict.
    data = {pname1: {grp: grp1, pname: pname1, cnt: cnt1, cat: cat1},
            pname2: {gpr: grp2, ...} ...}
    """
    data = {}
    with open(fn, 'r') as f:
        for lines in f:
            line = lines.rstrip()
            grp, pname, cnt, cat = line.split(maxsplit=3)
            data.update({pname: {'grp': float(grp.replace(',', '')), 'pname': pname, 'cnt': int(cnt), 'cat': cat}})
            
    return data


def process_data(oldfn, newfn):  
    """
    Read old and new files, update the old file based on new file.
    Save output to text, and csv files.
    """
    # Get old and new data in dict.
    old = read_file(oldfn)
    new = read_file(newfn)

    # Update old data based on new data
    u_data = {}
    for ko, vo in old.items():
        if ko in new:
            n = new[ko]
            
            # Update cnt.
            old_cnt = vo['cnt']
            new_cnt = n['cnt']
            u_cnt = old_cnt + new_cnt

            # cnt change, if old is zero we set it to 1 to avoid division by zero error.
            tmp_old_cnt = 1 if old_cnt == 0 else old_cnt
            cnt_change = 100 * (new_cnt - tmp_old_cnt) / tmp_old_cnt

            # grp change
            old_grp = vo['grp']
            new_grp = n['grp']
            grp_change = 100 * (new_grp - old_grp) / old_grp

            u_data.update({ko: {'grp': n['grp'], 'pname': n['pname'], 'cnt': u_cnt, 'cat': n['cat'],
                                'cnt_change%': round(cnt_change, 2), 'grp_change%': round(grp_change, 2)}})

    # add new data to u_data, that is not in old data
    for kn, vn in new.items():
        if kn not in old:        
            # Since this is new item its old cnt is zero, we set it to 1 to avoid division by zero error.
            old_cnt = 1
            new_cnt = vn['cnt']
            cnt_change = 100 * (new_cnt - old_cnt) / old_cnt        

            # grp change is similar to cnt change
            old_grp = 1
            new_grp = vn['grp']
            grp_change = 100 * (new_grp - old_grp) / old_grp
            
            # Update new columns.
            vn.update({'cnt_change%': round(cnt_change, 2), 'grp_change%': round(grp_change, 2)})        
            u_data.update({kn: vn})
            
    # Create new data mydata list from u_data, and only extract the dict value.
    mydata = []
    for _, v in u_data.items():
        mydata.append(v)
        
    # Convert mydata into pandas dataframe to easier manage the data.
    df = pd.DataFrame(mydata)
    df = df.sort_values(by=['cnt'], ascending=False)  # sort on cnt column
    
    # Save to csv file.
    df.to_csv('output.csv', index=False)
    
    # Save to text file.
    with open('output.txt', 'w') as w:
        w.write(f'{df.to_string(index=False)}')
        
    # Print in console.    
    print(df.to_string(index=False))


# Start
oldfn = 'F:/Tmp/oldFile.txt'
newfn = 'F:/Tmp/newFile.txt'
process_data(oldfn, newfn)

控制台输出:

   grp                    pname  cnt                      cat  cnt_change%  grp_change%
8739.0 6ea059a29eccecee4e250414   62   MAXIMACASH (MAXCAS...)      2900.00       219.06
 138.0 1c6bc8e962427deb4106ae06   58          Charge (Charge)       525.00       -68.49
 860.0 31b5c07636dab8f0909dbd2d   46 Buff Unicorn (BUFFUN...)       566.67       272.29
 200.0 9e4d81c8fc15870b15aef8dc   33          BABY BNB (BBNB)       900.00       -28.32
  20.0 5esdsds2sd15870b15aef8dc   30       CharliesAngel (CA)      2900.00      1900.00
1560.0 c15e89f2149bcc0cbd5fb204   24          HUH_Token (HUH)       400.00        11.75

文本输出:

   grp                    pname  cnt                      cat  cnt_change%  grp_change%
8739.0 6ea059a29eccecee4e250414   62   MAXIMACASH (MAXCAS...)      2900.00       219.06
 138.0 1c6bc8e962427deb4106ae06   58          Charge (Charge)       525.00       -68.49
 860.0 31b5c07636dab8f0909dbd2d   46 Buff Unicorn (BUFFUN...)       566.67       272.29
 200.0 9e4d81c8fc15870b15aef8dc   33          BABY BNB (BBNB)       900.00       -28.32
  20.0 5esdsds2sd15870b15aef8dc   30       CharliesAngel (CA)      2900.00      1900.00
1560.0 c15e89f2149bcc0cbd5fb204   24          HUH_Token (HUH)       400.00        11.75

csv 输出:

grp,pname,cnt,cat,cnt_change%,grp_change%
8739.0,6ea059a29eccecee4e250414,62,MAXIMACASH (MAXCAS...),2900.0,219.06
138.0,1c6bc8e962427deb4106ae06,58,Charge (Charge),525.0,-68.49
860.0,31b5c07636dab8f0909dbd2d,46,Buff Unicorn (BUFFUN...),566.67,272.29
200.0,9e4d81c8fc15870b15aef8dc,33,BABY BNB (BBNB),900.0,-28.32
20.0,5esdsds2sd15870b15aef8dc,30,CharliesAngel (CA),2900.0,1900.0
1560.0,c15e89f2149bcc0cbd5fb204,24,HUH_Token (HUH),400.0,11.75

【讨论】:

  • 解释得很好。谢谢你。这足以让我们继续努力。
【解决方案2】:

只是提供基于convtools 的替代解决方案,您可能会在其中找到有用的部分:

from convtools import conversion as c
from convtools.contrib.tables import Table

# your percentage change calculation
def c_change(column_name):
    return c.if_(
        c.and_(
            c.col(f"{column_name}_LEFT"),
            c.col(f"{column_name}_RIGHT").is_not(None),
        ),
        (
            (c.col(f"{column_name}_RIGHT") - c.col(f"{column_name}_LEFT"))
            / c.col(f"{column_name}_LEFT")
            * 100.0
        ).pipe(round, 2),
        None,
    )

prepare_columns = {
    "COLUMN_0": c.col("COLUMN_0").as_type(float),
    "COLUMN_2": c.col("COLUMN_2").as_type(float),
}
dialect = Table.csv_dialect(delimiter="\t")

sorted_rows = sorted(
    Table.from_csv("tmp1.csv", dialect=dialect)
    .update(**prepare_columns)
    .join(
        Table.from_csv(
            "tmp2.csv",
            dialect=dialect,
        ).update(**prepare_columns),
        on=["COLUMN_1", "COLUMN_3"],
        how="full",
    )
    .update(
        COLUMN_4=c_change("COLUMN_2"),
        COLUMN_5=c_change("COLUMN_0"),
        COLUMN_2=c.col("COLUMN_2_RIGHT"),
        COLUMN_0=c.col("COLUMN_0_RIGHT"),
    )
    .take(
        "COLUMN_0",
        "COLUMN_1",
        "COLUMN_2",
        "COLUMN_3",
        "COLUMN_4",
        "COLUMN_5",
    )
    .into_iter_rows(tuple),
    key=lambda row: row[2],
    reverse=True,
)

Table.from_rows(sorted_rows).into_csv("tmp_result.csv", dialect=dialect)

结果:

COLUMN_0    COLUMN_1    COLUMN_2    COLUMN_3    COLUMN_4    COLUMN_5
8739.0  6ea059a29eccecee4e250414    60.0    MAXIMACASH (MAXCAS...)  2900.0  219.06
138.0   1c6bc8e962427deb4106ae06    50.0    Charge (Charge) 525.0   -68.49
860.0   31b5c07636dab8f0909dbd2d    40.0    Buff Unicorn (BUFFUN...)    566.67  272.29
200.0   9e4d81c8fc15870b15aef8dc    30.0    BABY BNB (BBNB) 900.0   -28.32
20.0    5esdsds2sd15870b15aef8dc    30.0    CharliesAngel (CA)      
1560.0  c15e89f2149bcc0cbd5fb204    20.0    HUH_Token (HUH) 400.0   11.75

【讨论】:

    猜你喜欢
    • 2014-11-09
    • 1970-01-01
    • 2017-09-11
    • 2021-04-05
    • 1970-01-01
    • 1970-01-01
    • 2015-01-30
    • 2015-04-17
    • 1970-01-01
    相关资源
    最近更新 更多