numpy 📒笔记
作为ML任务的一个 很重要 的任务,
数据处理
是十分重要的一环,而numpy在向量运算方面做了很多优化,所以我们需要熟练掌握numpy的使用。需要提的是,numpy开数组内存开销很大:numpy存一个int32
需要96byte,
而int32本身只要4byte,相差24倍。我在处理一个1G的数据的时候,用list开只要1G左右,用np内存飙到110G+,直接打停了。所以在大数据量的处理中,numpy还是值得商榷的一个工具,但熟练掌握np是必不可少的,故参考numpy-100做了一个整理
This is a note for numpy-100
1. np.add.reduce() > np.sum()
* np.sum(Z) # 调用np.add.reduce()实现
* np.add.reduce(Z) # 在reshape的时候会产生一个数组
* np.logical_and.reduce(arr3[:, :-1]<arr3[:,1:], axis=1) # 逻辑与
* np.add.accumulate(Z) # reduce by processing
= np.cumsum(Z)
* np.add.reduceat(Z, np.array([1,4])) # print sum(1-3) sum(4-end)
* np.add.outer(Z, Y) # Z+Y 以Z为基础
* np.frompyfunc(add_elements, 2,1) # 构造自定义
2. Z[::-1]
* Z[1::2,::2]
3. np.nonzero([1,2,0,0,4,0])
4. 矩阵扩展
* np.pad(Z, pad_width=1, mode='constant', constant_values=0)
* np.pad(Z, pad_width=2, mode='edge')
* np.pad(Z, pad_width=2, mode='linear_ramp')
* np.pad(Z, pad_width=2, mode='maximum')
* np.pad(Z, pad_width=2, mode='median')
* np.pad(Z, pad_width=2, mode='reflect')
* np.pad(Z, pad_width=2, mode='symmetric')
* np.pad(Z, pad_width=2, mode='wrap')
5. np.nan np.inf
6. np.diag(1+np.arange(4),k=-1)
* np.diag(np.random.random((4,4)))
7. np.unravel_index(99,(6,7,8)) # find index in shape(6,7,8)
8. 重复
* np.tile(np.array([[0,1],[1,0]]), (4,4))
* Z.repeat(3,axis=1)
9. (Z - np.mean (Z)) / (np.std (Z))
* np.random.uniform(-10,+10,10)
10. np.ubyte
11. np.array(0) // np.array(0)
* np.array(0) / np.array(0)
12. 取整
* np.copysign(np.ceil(np.abs(Z)), Z)
* np.floor() = Z - Z%1
* np.ceil()
* np.trunc()
* Z.astype(int)
13. def generate():
for x in range(10):
yield x
Z = np.fromiter(generate(),dtype=float,count=-1)
14. np.linspace(0,1,11,endpoint=False)
15. equal
* np.array_equal(A, B)
* np.allclose(1e-8,1e-9)
* np.allclose(A,B) # tolerance
16. Z.flags.writeable = False
17. Z.argmax()
18. Z['x'], Z['y'] = np.meshgrid(np.linspace(0,1,5),
np.linspace(0,1,5))
* Z['x'] # 横向uniform Z['y']纵向uniform
19. Array的合并
* np.concatenate([arr1,arr2], axis=0)
= np.vstack((arr1,arr2))
= np.row_stack((arr1,arr2))
= np.r_[arr1, arr2]
* np.concatenate([arr1,arr2], axis=1)
= np.hstack((arr1,arr2))
= np.column_stack((arr1,arr2))
= np.c_[arr1, arr2]
* np.concatenate([arr1,arr2], axis=2) # 如果没有第三维度的时候报错
= np.dstack((arr1,arr2)) # No Error
20. Array 拆分
* np.split(arr3, [1,3],axis=0)
= np.vsplit(arr3, [1,3])
* np.split(arr3, [1,3],axis=1)
= np.hsplit(arr3, [1,3])
* np.split(arr3, [1,3],axis=2)
= np.dsplit(arr3, [1,3])
* np.setdiff1d(A, B, assume_unique=True)# 差集
21. np.take(arr,indexs)
22. arr_3d = arr[:, np.newaxis, :] # 拓展维数
23. sort
* np.sort(Z)
* np.argsort(Z)
* np.lexsort(Z)
* np.argsort(Z, kind='mergesort',order='index')
* np.argsort(Z, kind='quicksort')
* np.argsort(Z, kind='heapsort')
* np.searchsorted(Z, v)
* np.partition(list,3) # 3 是 kth: 排序后应位于下标3的元素会被放到该位置, 更小的在其前, 更大的在其后
* sort(Z)
24. np.linalg.det(C) # 行列式值
25. np.set_printoptions(threshold=sys.maxsize) # Array不省略 (需要 import sys; 新版numpy中 threshold=np.nan 会报错)
26. np.atleast_1d(arr) np.atleast_2d(arr) np.atleast_3d(arr) # 拓宽n维
27. np.genfromtxt(s, delimiter=",", dtype=int) # get data from file (np.int 已在 NumPy 1.24 中移除)
28. np.ndenumerate()
29. (~Z.any(axis=0)).any() # 检测是否存在全零(null)的列
30. 降维
* Z.flatten() # 降维 Z.flatten('F')列
* Z.flat[index] # 返回降维后第index个元素
* np.ravel(Z) # np.ravel(Z, order='F')
* np.resize(Z,(2,4)) # 重组
* np.reshape(Z, (2,4)) # 重组 (等价于 Z.reshape((2,4)))
31. 按索引叠加
* np.bincount(I, minlength=len(Z)) # 按值计数
* np.add.at(Z,I,1)
* np.bincount(I, X) # 以I为基 统计count
32.diagonal of dot
* np.diag(A @ B) # slow
= np.sum(A * B.T, axis=1) # fast
= np.einsum("ij,ji->i", A, B) # fastest
* np.einsum('i->', a) # sum
* np.einsum('i, i->i', a, b) # 对应乘 Hadamard
* np.einsum('i, i', a, b) # 内积 np.inner(a, b)
* np.einsum('i, j->ij', a, b) # 外积 np.outer(a, b)
* np.einsum('ji', A) # 转置
* np.einsum('ii->i', A) # np.diag(A)
* np.einsum('ii', A) # np.trace(A)
* np.einsum('ij->', A) # np.sum(A)
* np.einsum('ij->j', A) # np.sum(A, axis=0)
* np.einsum('ij->i', A) # np.sum(A, axis=1)
* np.einsum('ij, ij->ij', A, B)# A*B
* np.einsum('ij, ji->ij', A, B)# A*B.T
* np.einsum('ij, jk', A, B) # np.dot(A, B)
* np.einsum('ij, ij', A, B) # 内积
33. A[[0,1]] = A[[1,0]] # 交换两行
34. Z0 = np.zeros(len(Z) + (len(Z)-1)*(nz))
Z0[::nz+1] = Z # 插入0
35. np.repeat(np.arange(len(C)), C) # 与np.bincount(A)互逆
36. np.roll(faces.repeat(2,axis=1),-1,axis=1) # 滚动-1
37. np.logical_not(Z, out=Z) # Boolean 取反
np.negative(Z, out=Z) # 符号反
38. point to line distance
def distance(P0, P1, p):
    """Return the distance from ``p`` to each line through ``P0[i]`` and ``P1[i]``.

    ``P0`` and ``P1`` are indexed as 2-D arrays with one row per line and the
    x/y coordinates in columns 0 and 1. ``p`` is indexed with ``p[..., 0]``
    and ``p[..., 1]``; a single point of shape ``(2,)`` is the usual input.

    The lines are treated as infinite: the projection parameter is not
    clamped to the segment between ``P0[i]`` and ``P1[i]``. If ``P0[i]``
    equals ``P1[i]`` the squared length is zero and the division below
    yields ``nan``/``inf`` for that row.

    Returns a 1-D array with one distance per line.
    """
    # Direction vector of every line and its squared length.
    T = P1 - P0
    L = (T**2).sum(axis=1)
    # Parameter of the orthogonal projection of p onto each line
    # (0 at P0, 1 at P1).
    U = -((P0[:, 0] - p[..., 0]) * T[:, 0] + (P0[:, 1] - p[..., 1]) * T[:, 1]) / L
    # Column vector so that it broadcasts against the rows of T.
    U = U.reshape(len(U), 1)
    # Vector from p to the foot of the perpendicular on each line.
    D = P0 + U * T - p
    return np.sqrt((D**2).sum(axis=1))
39.秩
U, S, V = np.linalg.svd(Z)
rank = np.sum(S > 1e-10)
40. Z.strides # 跨度 在某个维度上获得下一个值所需要跨过的字节
41. n largest
* Z[np.argsort(Z)[-n:]] # slow
* Z[np.argpartition(-Z,n)[:n]] # fast
42. 笛卡尔积
def cartesian(arrays):
    """Return the Cartesian product of several 1-D sequences.

    ``arrays`` is an iterable of 1-D array-likes. The result is a 2-D array
    with one row per combination and one column per input sequence; rows are
    ordered with the first sequence varying slowest.

    The output uses the common dtype of the inputs, so non-integer values are
    kept as they are rather than being truncated into an integer index array.
    """
    arrays = [np.asarray(a) for a in arrays]
    shape = tuple(len(a) for a in arrays)
    # ix[k] holds, for every combination, the position to take from arrays[k].
    ix = np.indices(shape, dtype=int)
    # One row per combination, one column per input sequence.
    ix = ix.reshape(len(arrays), -1).T
    # Gather into a separate array of the inputs' common dtype; writing the
    # values back into the integer index array would truncate floats.
    out = np.empty(ix.shape, dtype=np.result_type(*arrays))
    for n, arr in enumerate(arrays):
        out[:, n] = arr[ix[:, n]]
    return out
43. np.unpackbits(Z, axis=1) # 转成2进制
44. np.interp(a, A, B) # 插值
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
You can use this BibTeX entry to cite this blog post if you find it useful.