Python 数据处理，切片，替换，去重，排序

一、把下面这组数据进行处理，进行规则排序。

第一版代码：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

#!/usr/local/python3/bin/python3
# Normalizes the separator used inside a time string so that every value
# shares one style and can be sorted consistently.
def sanitize(time_string):
    """Return *time_string* with its '-' or ':' separator replaced by '.'.

    '-' is looked for first, then ':'. A string that contains neither is
    returned unchanged. The string is split on the separator that was found
    and must yield exactly two parts (minutes and seconds); otherwise the
    tuple unpacking raises ValueError.
    """
    for separator in ('-', ':'):
        if separator in time_string:
            # Cut the value into its two halves and glue them back with '.'.
            minutes, seconds = time_string.split(separator)
            return minutes + '.' + seconds
    return time_string
 
# Each file is opened in a `with` block, which closes the file automatically
# when the block ends, so no explicit close() call is needed.
# Only the first line of each file is read; it is stripped of surrounding
# whitespace and split on commas.
# Note: the lists could also be sorted right here, e.g.
# james2 = sorted(james1), and james2 printed directly at the end.
with open('james') as james_file:
    james1 = james_file.readline().strip().split(',')

with open('julie') as julie_file:
    julie1 = julie_file.readline().strip().split(',')

with open('mikey') as mikey_file:
    mikey1 = mikey_file.readline().strip().split(',')

with open('sarah') as sarah_file:
    sarah1 = sarah_file.readline().strip().split(',')
 

# Pass every raw entry through sanitize() and collect the results in a new
# list, so that all values in each list share the same separator style.
clean_james = [sanitize(raw_time) for raw_time in james1]
clean_julie = [sanitize(raw_time) for raw_time in julie1]
clean_mikey = [sanitize(raw_time) for raw_time in mikey1]
clean_sarah = [sanitize(raw_time) for raw_time in sarah1]
 
# Print each normalized list in sorted order (the values are strings, so the
# ordering is by string comparison).
for cleaned_times in (clean_james, clean_julie, clean_mikey, clean_sarah):
    print(sorted(cleaned_times))

输出结果：

这就完成了规则排序。

二、需要给数据去重复，排序，只输出前三项数据。

第二版代码：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

#!/usr/local/python3/bin/python3

def sanitize(time_string):
    """Normalize a 'mins-secs' or 'mins:secs' string to 'mins.secs'.

    A string containing neither '-' nor ':' is returned as is. When both
    characters are present, '-' is the one used for splitting. The split must
    produce exactly two parts, otherwise ValueError is raised by the
    unpacking.
    """
    has_dash = '-' in time_string
    if not has_dash and ':' not in time_string:
        # Nothing to convert.
        return time_string

    delimiter = '-' if has_dash else ':'
    minutes, seconds = time_string.split(delimiter)
    return '.'.join((minutes, seconds))
 
# Removes duplicates: replaces the per-list for loops that were otherwise
# repeated for every data set, which keeps the script shorter.
def pomoto(old_list, new_list):
    """Append to *new_list* every item of *old_list* it does not yet contain.

    Items are visited in order, so the first occurrence of each value is the
    one that is kept. *new_list* is modified in place; nothing is returned.
    """
    for candidate in old_list:
        if candidate in new_list:
            # Already collected; skip the duplicate.
            continue
        new_list.append(candidate)
 

# Read the first line of each data file, strip surrounding whitespace and
# split it on commas. The `with` block closes each file automatically.
with open('james') as james_file:
    james1 = james_file.readline().strip().split(',')

with open('julie') as julie_file:
    julie1 = julie_file.readline().strip().split(',')

with open('mikey') as mikey_file:
    mikey1 = mikey_file.readline().strip().split(',')

with open('sarah') as sarah_file:
    sarah1 = sarah_file.readline().strip().split(',')
 
 

# Normalize every entry with sanitize() and sort the results in one step
# (this replaces the explicit append loops of the first version).
clean_james = sorted(map(sanitize, james1))
clean_julie = sorted(map(sanitize, julie1))
clean_mikey = sorted(map(sanitize, mikey1))
clean_sarah = sorted(map(sanitize, sarah1))

# Fill one fresh list per data set with the distinct values of the sorted
# list, in order, by calling pomoto().
unique_james = []
pomoto(clean_james, unique_james)

unique_julie = []
pomoto(clean_julie, unique_julie)

unique_mikey = []
pomoto(clean_mikey, unique_mikey)

unique_sarah = []
pomoto(clean_sarah, unique_sarah)
 
#每列表迭代方式的去重复
#for i in clean_james:
#    if i not in unique_james:
#        unique_james.append(i)
#for i in clean_julie:
#    if i not in unique_julie:
#        unique_julie.append(i)
#for i in clean_mikey:
#    if i not in unique_mikey:
#        unique_mikey.append(i)
#for i in clean_sarah:
#    if i not in unique_sarah:
#        unique_sarah.append(i)
 
# Print only the first three entries of each de-duplicated list.
for unique_times in (unique_james, unique_julie, unique_mikey, unique_sarah):
    print(unique_times[:3])

输出结果：

三、使用集合删除重复项，将重复的with open定义为函数，简洁代码，进行逆序排序，输出前三项。

第三版代码：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

#!/usr/local/python3/bin/python3

def sanitize(time_string):
    """Rewrite a time such as '2-34' or '2:34' in the uniform form '2.34'.

    The first of '-' and ':' (checked in that order) found in the string is
    used as the split point. Strings with neither character are returned
    untouched. Splitting must give exactly two pieces; if it does not, the
    unpacking raises ValueError.
    """
    found = next((mark for mark in '-:' if mark in time_string), None)
    if found is None:
        return time_string

    mins_part, secs_part = time_string.split(found)
    return '{}.{}'.format(mins_part, secs_part)

     
# Opens a data file and returns its contents as a list; includes error
# handling for files that cannot be read.
def get_file_data(filename):
    """Return the comma-separated fields of the first line of *filename*.

    The first line is read, stripped of surrounding whitespace and split on
    commas.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, or None when opening or reading the file raises
        IOError. In that case the error is reported on standard output.
    """
    try:
        # Only the I/O itself is guarded; the `with` block closes the file.
        with open(filename) as f:
            data = f.readline()
    except IOError as ioerr:
        # ': ' keeps the fixed prefix apart from the system error text.
        print('File error: ' + str(ioerr))
        return None

    return data.strip().split(',')
 
# Each call yields the already split data of one file, or None when
# get_file_data() could not read that file.
james1 = get_file_data('james')
julie1 = get_file_data('julie')
mikey1 = get_file_data('mikey')
sarah1 = get_file_data('sarah')

# For every data set: normalize the entries, drop duplicates with a set,
# sort in descending order and print the first three values.
for times in (james1, julie1, mikey1, sarah1):
    if times is None:
        # The read error was already reported by get_file_data(); skip this
        # data set instead of failing with a TypeError when iterating None.
        continue
    print(sorted({sanitize(entry) for entry in times}, reverse=True)[0:3])

输出结果：

定义 get_file_data 函数代替了多行重复的 with open，使代码更加简洁。

Python 集合数据结构：集合中的数据项是无序的，而且不允许重复，这和数学中的集合很像。

最后print输出代码：

本文转自qw87112 51CTO博客，原文链接:http://blog.51cto.com/tchuairen/1678435