机器学习入门(1)------python基础

环境：Python 3.6.4 |Anaconda, Inc.

Python常用容器类型

1.list

 1 l = [1, \'a\', 2, \'b\']
 2 print(type(l))
 3 print(\'修改前：\', l)
 4 
 5 # 修改list的内容
 6 l[0] = 3
 7 print(\'修改后：\', l)
 8 
 9 # 末尾添加元素
10 l.append(4)
11 print(\'添加后：\', l)
12 
13 # 遍历list
14 print(\'遍历list(for循环)：\')
15 for item in l:
16     print(item)
17     
18 # 通过索引遍历list
19 print(\'遍历list(while循环)：\')
20 i = 0
21 while i != len(l):
22     print(l[i])
23     i += 1
24     
25 # 列表合并
26 print(\'列表合并(+)：\', [1, 2] + [3, 4])
27 
28 # 列表重复
29 print(\'列表重复(*)：\', [1, 2] * 5)
30 
31 # 判断元素是否在列表中
32 print(\'判断元素存在(in)：\', 1 in [1, 2])

<class \'list\'>
修改前： [1, \'a\', 2, \'b\']
修改后： [3, \'a\', 2, \'b\']
添加后： [3, \'a\', 2, \'b\', 4]
遍历list(for循环)：
3
a
2
b
4
遍历list(while循环)：
3
a
2
b
4
列表合并(+)： [1, 2, 3, 4]
列表重复(*)： [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
判断元素存在(in)： True

2.tuple

 1 t = (1, \'a\', 2, \'b\')
 2 print(type(t))
 3 
 4 #元组的内容不能修改，否则会报错
 5 # t[0] = 3 
 6 
 7 # 遍历tuple
 8 print(\'遍历list(for循环)：\')
 9 for item in t:
10     print(item)
11     
12 # 通过索引遍历tuple
13 print(\'遍历tuple(while循环)：\')
14 i = 0
15 while i != len(t):
16     print(t[i])
17     i += 1
18     
19 # 解包 unpack
20 a, b, _, _ = t
21 print(\'unpack: \', c)
22 
23 # 确保unpack接收的变量个数和tuple的长度相同，否则报错
24 # 经常出现在函数返回值的赋值时
25 # a, b, c = t

<class \'tuple\'>
遍历list(for循环)：
1
a
2
b
遍历tuple(while循环)：
1
a
2
b

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-88506d8b1a51> in <module>()
     19 # 解包 unpack
     20 a, b, _, _ = t
---> 21 print(\'unpack: \', c)
     22 
     23 # 确保unpack接收的变量个数和tuple的长度相同，否则报错

NameError: name \'c\' is not defined

说明：解包时只给 a、b 赋了值（其余两个元素用 _ 丢弃），变量 c 从未定义，所以第21行会抛出 NameError；把 c 换成 a 或 b 即可正常输出。

3.dictionary

 1 d = {\'小象学院\': \'http://www.chinahadoop.cn/\',
 2     \'百度\': \'https://www.baidu.com/\',
 3     \'阿里巴巴\': \'https://www.alibaba.com/\',
 4     \'腾讯\': \'https://www.tencent.com/\'}
 5 
 6 print(\'通过key获取value: \', d[\'小象学院\'])
 7 
 8 # 遍历key
 9 print(\'遍历key: \')
10 for key in d.keys():
11     print(key)
12     
13 # 遍历value
14 print(\'遍历value: \')
15 for value in d.values():
16     print(value)
17     
18 # 遍历item
19 print(\'遍历item: \')
20 for key, value in d.items():
21     print(key + \': \' + value)
22 
23 # format输出格式
24 print(\'format输出格式：\')
25 for key, value in d.items():
26     print(\'{}的网址是{}\'.format(key, value))

通过key获取value:  http://www.chinahadoop.cn/
遍历key: 
小象学院
百度
阿里巴巴
腾讯
遍历value: 
http://www.chinahadoop.cn/
https://www.baidu.com/
https://www.alibaba.com/
https://www.tencent.com/
遍历item: 
小象学院: http://www.chinahadoop.cn/
百度: https://www.baidu.com/
阿里巴巴: https://www.alibaba.com/
腾讯: https://www.tencent.com/
format输出格式：
小象学院的网址是http://www.chinahadoop.cn/
百度的网址是https://www.baidu.com/
阿里巴巴的网址是https://www.alibaba.com/
腾讯的网址是https://www.tencent.com/

4.set

 1 print(\'创建set:\')
 2 my_set = {1, 2, 3}
 3 print(my_set)
 4 my_set = set([1, 2, 3, 2])
 5 print(my_set)
 6 
 7 print(\'添加单个元素:\')
 8 my_set.add(3)
 9 print(\'添加3\', my_set)
10 
11 my_set.add(4)
12 print(\'添加4\', my_set)
13 
14 print(\'添加多个元素：\')
15 my_set.update([4, 5, 6])
16 print(my_set)

创建set:
{1, 2, 3}
{1, 2, 3}
添加单个元素:
添加3 {1, 2, 3}
添加4 {1, 2, 3, 4}
添加多个元素：
{1, 2, 3, 4, 5, 6}

5.Counter

初始化

1 import collections
2 
3 c1 = collections.Counter([\'a\', \'b\', \'c\', \'a\', \'b\', \'b\'])
4 c2 = collections.Counter({\'a\':2, \'b\':3, \'c\':1})
5 c3 = collections.Counter(a=2, b=3, c=1)
6 
7 print(c1)
8 print(c2)
9 print(c3)

Counter({\'b\': 3, \'a\': 2, \'c\': 1})
Counter({\'b\': 3, \'a\': 2, \'c\': 1})
Counter({\'b\': 3, \'a\': 2, \'c\': 1})

更新内容

1 # 注意这里是做“加法”，不是“替换”
2 c1.update({\'a\': 4, \'c\': -2, \'d\': 4})
3 print(c1)

Counter({\'a\': 6, \'d\': 4, \'b\': 3, \'c\': -1})

访问内容

1 print(\'a=\', c1[\'a\'])
2 print(\'b=\', c1[\'b\'])
3 # 对比和dict的区别
4 print(\'e=\', c1[\'e\'])

a= 6
b= 3
e= 0

elements()方法

1 for element in c1.elements():
2     print(element)

d
d
d
d
b
b
b
a
a
a
a
a
a

most_common()方法

1 c1.most_common(3)

[(\'a\', 6), (\'d\', 4), (\'b\', 3)]

6.defaultdict

1 # 统计每个字母出现的次数
2 s = \'chinadoop\'
3 
4 # 使用Counter
5 print(collections.Counter(s))

Counter({\'o\': 2, \'d\': 1, \'c\': 1, \'p\': 1, \'a\': 1, \'n\': 1, \'h\': 1, \'i\': 1})

1 # 使用dict
2 counter = {}
3 for c in s:
4     if c not in counter:
5         counter[c] = 1
6     else:
7         counter[c] += 1
8         
9 print(counter.items())

dict_items([(\'d\', 1), (\'c\', 1), (\'p\', 1), (\'a\', 1), (\'o\', 2), (\'n\', 1), (\'h\', 1), (\'i\', 1)])

1 # 使用defaultdict
2 counter2 = collections.defaultdict(int)
3 for c in s:
4     counter2[c] += 1
5 print(counter2.items())

dict_items([(\'d\', 1), (\'c\', 1), (\'p\', 1), (\'a\', 1), (\'o\', 2), (\'n\', 1), (\'h\', 1), (\'i\', 1)])

1 # 记录相同元素的列表
2 colors = [(\'yellow\', 1), (\'blue\', 2), (\'yellow\', 3), (\'blue\', 4), (\'red\', 1)]
3 d = collections.defaultdict(list)
4 for k, v in colors:
5     d[k].append(v)
6 
7 print(d.items())

dict_items([(\'blue\', [2, 4]), (\'yellow\', [1, 3]), (\'red\', [1])])

7.map函数

 1 import math
 2 
 3 print(\'示例1，获取两个列表对应位置上的最小值：\')
 4 l1 = [1, 3, 5, 7, 9]
 5 l2 = [2, 4, 6, 6, 9]
 6 mins = map(min, l1, l2)
 7 print(mins)
 8 
 9 # map()函数操作时，直到访问数据时才会执行
10 for item in mins:
11     print(item)
12 
13 print(\'示例2，对列表中的元素进行平方根操作：\')
14 squared = map(math.sqrt, l2)
15 print(squared)
16 print(list(squared))

示例1，获取两个列表对应位置上的最小值：
<map object at 0x0000019AF8B0CDD8>
1
3
5
6
9
示例2，对列表中的元素进行平方根操作：
<map object at 0x0000019AF8A79DD8>
[1.4142135623730951, 2.0, 2.449489742783178, 2.449489742783178, 3.0]

8.匿名函数lambda

 1 # my_func = lambda a, b, c: a * b
 2 # print(my_func)
 3 # print(my_func(1, 2, 3))
 4 
 5 # 结合map
 6 print(\'lambda结合map\')
 7 l1 = [1, 3, 5, 7, 9]
 8 l2 = [2, 4, 6, 8, 10]
 9 result = map(lambda x, y: x * 2 + y, l1, l2)
10 print(list(result))

lambda结合map
[4, 10, 16, 22, 28]

9.python操作csv数据文件

1 import csv
2 
3 with open(\'grades.csv\') as csvfile:
4     grades_data = list(csv.DictReader(csvfile))
5     
6 print(\'记录个数：\', len(grades_data))
7 print(\'前2条记录：\', grades_data[:2])
8 print(\'列名：\', list(grades_data[0].keys()))

记录个数： 2315
前2条记录： [OrderedDict([(\'student_id\', \'B73F2C11-70F0-E37D-8B10-1D20AFED50B1\'), (\'assignment1_grade\', \'92.73394640624123\'), (\'assignment1_submission\', \'2015-11-02 06:55:34.282000000\'), (\'assignment2_grade\', \'83.03055176561709\'), (\'assignment2_submission\', \'2015-11-09 02:22:58.938000000\'), (\'assignment3_grade\', \'67.16444141249367\'), (\'assignment3_submission\', \'2015-11-12 08:58:33.998000000\'), (\'assignment4_grade\', \'53.01155312999494\'), (\'assignment4_submission\', \'2015-11-16 01:21:24.663000000\'), (\'assignment5_grade\', \'47.710397816995446\'), (\'assignment5_submission\', \'2015-11-20 13:24:59.692000000\'), (\'assignment6_grade\', \'38.16831825359636\'), (\'assignment6_submission\', \'2015-11-22 18:31:15.934000000\')]), OrderedDict([(\'student_id\', \'98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1\'), (\'assignment1_grade\', \'86.79082085792986\'), (\'assignment1_submission\', \'2015-11-29 14:57:44.429000000\'), (\'assignment2_grade\', \'86.29082085792986\'), (\'assignment2_submission\', \'2015-12-06 17:41:18.449000000\'), (\'assignment3_grade\', \'69.7726566863439\'), (\'assignment3_submission\', \'2015-12-10 08:54:55.904000000\'), (\'assignment4_grade\', \'55.0981253490751\'), (\'assignment4_submission\', \'2015-12-13 17:32:30.941000000\'), (\'assignment5_grade\', \'49.5883128141676\'), (\'assignment5_submission\', \'2015-12-19 23:26:39.285000000\'), (\'assignment6_grade\', \'44.62948153275085\'), (\'assignment6_submission\', \'2015-12-21 17:07:24.275000000\')])]
列名： [\'student_id\', \'assignment1_grade\', \'assignment1_submission\', \'assignment2_grade\', \'assignment2_submission\', \'assignment3_grade\', \'assignment3_submission\', \'assignment4_grade\', \'assignment4_submission\', \'assignment5_grade\', \'assignment5_submission\', \'assignment6_grade\', \'assignment6_submission\']

1 avg_assign1 = sum([float(row[\'assignment1_grade\']) for row in grades_data]) / len(grades_data) 
2 print(\'assignment1平均分数：\', avg_assign1)

assignment1平均分数： 74.5357320747794

1 assign1_sub_month = set(row[\'assignment1_submission\'][:7] for row in grades_data)
2 print(assign1_sub_month)

{\'2016-02\', \'2015-09\', \'2016-01\', \'2016-04\', \'2016-03\', \'2016-06\', \'2016-08\', \'2015-10\', \'2016-05\', \'2016-07\', \'2015-12\', \'2015-11\'}

科学计算库NumPy

1 import numpy as np

1. 创建Array

1 my_list = [1, 2, 3]
2 x = np.array(my_list)
3 
4 print(\'列表：\', my_list)
5 print(\'Array: \', x)

列表： [1, 2, 3]
Array:  [1 2 3]

1 np.array([1, 2, 3]) - np.array([4, 5, 6])

array([-3, -3, -3])

1 m = np.array([[1, 2, 3], [4, 5, 6]])
2 print(m)
3 print(\'shape: \', m.shape)

[[1 2 3]
 [4 5 6]]
shape:  (2, 3)

1 n = np.arange(0, 30, 2)
2 print(n)

[ 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28]

1 n = n.reshape(3, 5)
2 print(\'reshape后: \')
3 print(n)

reshape后: 
[[ 0  2  4  6  8]
 [10 12 14 16 18]
 [20 22 24 26 28]]

1 print(\'ones:\n\', np.ones((3, 2)))
2 print(\'zeros:\n\', np.zeros((3, 2)))
3 print(\'eye:\n\', np.eye(3))
4 print(\'diag:\n\', np.diag(my_list))

ones:
 [[1. 1.]
 [1. 1.]
 [1. 1.]]
zeros:
 [[0. 0.]
 [0. 0.]
 [0. 0.]]
eye:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
diag:
 [[1 0 0]
 [0 2 0]
 [0 0 3]]

1 print(\'*操作：\n\', np.array([1, 2, 3] * 3))
2 print(\'repeat：\n\', np.repeat([1, 2, 3], 3))

*操作：
 [1 2 3 1 2 3 1 2 3]
repeat：
 [1 1 1 2 2 2 3 3 3]

1 p1 = np.ones((3, 3))
2 p2 = np.arange(9).reshape(3, 3)
3 print(\'纵向叠加: \n\', np.vstack((p1, p2)))
4 print(\'横向叠加: \n\', np.hstack((p1, p2)))

纵向叠加: 
 [[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 0.  1.  2.]
 [ 3.  4.  5.]
 [ 6.  7.  8.]]
横向叠加: 
 [[ 1.  1.  1.  0.  1.  2.]
 [ 1.  1.  1.  3.  4.  5.]
 [ 1.  1.  1.  6.  7.  8.]]

2. Array操作

1 p1 = np.array([[1, 1, 1], [1, 1, 1],[1,1,1]])
2 p2 = np.arange(9).reshape(3, 3)
3 print(\'p1: \n\', p1)
4 print(\'p2: \n\', p2)
5 
6 print(\'p1 + p2 = \n\', p1 + p2)
7 print(\'p1 * p2 = \n\', p1 * p2)
8 print(\'p2^2 = \n\', p2 ** 2)
9 print(\'p1.p2 = \n\', p1.dot(p2))

p1: 
 [[1 1 1]
 [1 1 1]
 [1 1 1]]
p2: 
 [[0 1 2]
 [3 4 5]
 [6 7 8]]
p1 + p2 = 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
p1 * p2 = 
 [[0 1 2]
 [3 4 5]
 [6 7 8]]
p2^2 = 
 [[ 0  1  4]
 [ 9 16 25]
 [36 49 64]]
p1.p2 = 
 [[ 9 12 15]
 [ 9 12 15]
 [ 9 12 15]]

1 p3 = np.arange(6).reshape(2, 3)
2 print(\'p3形状: \', p3.shape)
3 print(p3)
4 p4 = p3.T
5 print(\'转置后p3形状: \', p4.shape)
6 print(p4)

p3形状:  (2, 3)
[[0 1 2]
 [3 4 5]]
转置后p3形状:  (3, 2)
[[0 3]
 [1 4]
 [2 5]]

1 p3 = np.arange(6).reshape(2, 3)
2 print(\'p3数据类型:\', p3.dtype)
3 print(p3)
4 
5 p5 = p3.astype(\'float\')
6 print(\'p5数据类型:\', p5.dtype)
7 print(p5)

p3数据类型: int32
[[0 1 2]
 [3 4 5]]
p5数据类型: float64
[[0. 1. 2.]
 [3. 4. 5.]]

a = np.array([-4, -2, 1, 3, 5])
print(\'sum: \', a.sum())
print(\'min: \', a.min())
print(\'max: \', a.max())
print(\'mean: \', a.mean())
print(\'std: \', a.std())  # 标准差
print(\'argmax: \', a.argmax())  # argmax返回数组中最大值所在的索引（即使f(x)取得最大值所对应的变量x）
print(\'argmin: \', a.argmin())  # argmin返回数组中最小值所在的索引（即使f(x)取得最小值所对应的变量x）

sum:  3
min:  -4
max:  5
mean:  0.6
std:  3.2619012860600183
argmax:  4
argmin:  0

3. 索引与切片

1 # 一维array
2 s = np.arange(13) ** 2
3 print(\'s: \', s)
4 print(\'s[0]: \', s[0])
5 print(\'s[4]: \', s[4])
6 print(\'s[0:3]: \', s[0:3])
7 print(\'s[[0, 2, 4]]: \', s[[0, 2, 4]])

s:  [  0   1   4   9  16  25  36  49  64  81 100 121 144]
s[0]:  0
s[4]:  16
s[0:3]:  [0 1 4]
s[[0, 2, 4]]:  [ 0  4 16]

1 # 二维array
2 r = np.arange(36).reshape((6, 6))
3 print(\'r: \n\', r)
4 print(\'r[2, 2]: \n\', r[2, 2])  # 对应矩阵第三行第三列
5 print(\'r[3, 3:6]: \n\', r[3, 3:6])  # 对应第四行的第4列到第6列（切片3:6不包含索引6，只取该行的数）

r: 
 [[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]]
r[2, 2]: 
 14
r[3, 3:6]: 
 [21 22 23]

1 r = np.arange(36).reshape((6, 6))
2 r > 30

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False,  True,  True,  True,  True,  True]])

1 # 过滤
2 print(r[r > 30])
3 
4 # 将大于30的数赋值为30
5 r[r > 30] = 30
6 print(r)

[31 32 33 34 35]
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 30 30 30 30 30]]

1 # copy()操作：切片得到的r2是r的视图(view)，不是副本
2 r2 = r[:3, :3]
3 print(r2)

[[ 0  1  2]
 [ 6  7  8]
 [12 13 14]]

1 # 将r2内容设置为0
2 r2[:] = 0
3 
4 # 查看r的内容
5 print(r)

[[ 0  0  0  3  4  5]
 [ 0  0  0  9 10 11]
 [ 0  0  0 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 30 30 30 30 30]]

1 r3 = r.copy()
2 r3[:] = 0
3 print(r)

[[ 0  0  0  3  4  5]
 [ 0  0  0  9 10 11]
 [ 0  0  0 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 30 30 30 30 30]]

4. 遍历 Array

1 import numpy as np
2 t = np.random.randint(0, 10, (4, 3))
3 print(t)

[[3 2 7]
 [4 9 1]
 [1 3 0]
 [0 9 1]]

1 for row in t:
2     print(row)

[3 2 7]
[4 9 1]
[1 3 0]
[0 9 1]

1 # 使用enumerate()
2 for i, row in enumerate(t):
3     print(\'row {} is {}\'.format(i, row))

row 0 is [3 2 7]
row 1 is [4 9 1]
row 2 is [1 3 0]
row 3 is [0 9 1]

1 t2 = t ** 2
2 print(t2)

[[ 9  4 49]
 [16 81  1]
 [ 1  9  0]
 [ 0 81  1]]

1 # 使用zip对两个array进行遍历计算
2 for i, j in zip(t, t2):
3     print(\'{} + {} = {}\'.format(i, j, i + j))

[3 2 7] + [ 9  4 49] = [12  6 56]
[4 9 1] + [16 81  1] = [20 90  2]
[1 3 0] + [1 9 0] = [ 2 12  0]
[0 9 1] + [ 0 81  1] = [ 0 90  2]