Numpy - Data Science

Numpy is the core library for scientific computing in Python. It provides a high-performance multidimensional array object, and tools for working with these arrays. If you are already familiar with MATLAB, you might find this tutorial useful to get started with Numpy. - [Official Numpy Documentations](https://numpy.org/doc/stable/user/absolute_beginners.html) - [CS231 Numpy Tutorials](https://cs231n.github.io/python-numpy-tutorial/#jupyter-and-colab-notebooks) ## Arrays **Creating** ```python import numpy as np a = np.array([1, 2, 3]) # Create a rank 1 array print(type(a)) # Prints "<class 'numpy.ndarray'>" print(a.shape) # Prints "(3,)" print(a[0], a[1], a[2]) # Prints "1 2 3" a[0] = 5 # Change an element of the array print(a) # Prints "[5, 2, 3]" x = np.array([[1, 2, 3],[2,3,4]]) print(x.shape) # (2, 3) print(x.ndim) # 2 print(x.size) # 6 print(x.reshape(3, 2)) # [[1 2] # [3 2] # [3 4]] b = np.array([[1,2,3],[4,5,6]]) # Create a rank 2 array print(b.shape) # Prints "(2, 3)" print(b[0, 0], b[0, 1], b[1, 0]) # Prints "1 2 4" ``` ```python import numpy as np a = np.zeros((2,2)) # Create an array of all zeros print(a) # Prints "[[ 0. 0.] # [ 0. 0.]]" b = np.ones((1,2)) # Create an array of all ones print(b) # Prints "[[ 1. 1.]]" c = np.full((2,2), 7) # Create a constant array print(c) # Prints "[[ 7. 7.] # [ 7. 7.]]" d = np.eye(2) # Create a 2x2 identity matrix print(d) # Prints "[[ 1. 0.] # [ 0. 1.]]" e = np.random.random((2,2)) # Create an array filled with random values print(e) # Might print "[[ 0.91940167 0.08143941] # [ 0.68744134 0.87236687]]" ``` ```python print(np.linspace(0, 10, num=5)) # [ 0. 2.5 5. 7.5 10. ] print(np.arange(2,10,2)) # [2 4 6 8] ``` ```python np.random ``` ## Array indexing **Slicing** ```python import numpy as np # Create the following rank 2 array with shape (3, 4) # [[ 1 2 3 4] # [ 5 6 7 8] # [ 9 10 11 12]] a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]]) # Use slicing to pull out the subarray consisting of the first 2 rows # and columns 1 and 2; b is the following array of shape (2, 2): # [[2 3] # [6 7]] b = a[:2, 1:3] # A slice of an array is a view into the same data, so modifying it # will modify the original array. print(a[0, 1]) # Prints "2" b[0, 0] = 77 # b[0, 0] is the same piece of data as a[0, 1] print(a[0, 1]) # Prints "77" ``` ```python import numpy as np # Create the following rank 2 array with shape (3, 4) # [[ 1 2 3 4] # [ 5 6 7 8] # [ 9 10 11 12]] a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]]) # Two ways of accessing the data in the middle row of the array. # Mixing integer indexing with slices yields an array of lower rank, # while using only slices yields an array of the same rank as the # original array: row_r1 = a[1, :] # Rank 1 view of the second row of a row_r2 = a[1:2, :] # Rank 2 view of the second row of a print(row_r1, row_r1.shape) # Prints "[5 6 7 8] (4,)" print(row_r2, row_r2.shape) # Prints "[[5 6 7 8]] (1, 4)" # We can make the same distinction when accessing columns of an array: col_r1 = a[:, 1] col_r2 = a[:, 1:2] print(col_r1, col_r1.shape) # Prints "[ 2 6 10] (3,)" print(col_r2, col_r2.shape) # Prints "[[ 2] # [ 6] # [10]] (3, 1)" ``` **Boolean array indexing** ```python import numpy as np a = np.array([[1,2], [3, 4], [5, 6]]) bool_idx = (a > 2) # Find the elements of a that are bigger than 2; # this returns a numpy array of Booleans of the same # shape as a, where each slot of bool_idx tells # whether that element of a is > 2. print(bool_idx) # Prints "[[False False] # [ True True] # [ True True]]" # We use boolean array indexing to construct a rank 1 array # consisting of the elements of a corresponding to the True values # of bool_idx print(a[bool_idx]) # Prints "[3 4 5 6]" # We can do all of the above in a single concise statement: print(a[(a > 2) & (a < 7)]) # Prints "[3 4 5 6]" ``` ## Operations ```python a = np.array([1, 2, 3, 4]) b = np.array([5, 6, 7, 8]) print(np.concatenate((a, b))) # [1 2 3 4 5 6 7 8] print(np.concatenate((x, y), axis=0)) # [[1 2] # [3 4] # [5 6] # [7 8]] ``` ```python >>> a1 = np.array([[1, 1], ... [2, 2]]) >>> a2 = np.array([[3, 3], ... [4, 4]]) >>> np.vstack((a1, a2)) array([[1, 1], [2, 2], [3, 3], [4, 4]]) >>> np.hstack((a1, a2)) array([[1, 1, 3, 3], [2, 2, 4, 4]]) ``` **flatten and ravel** ```python >>> x = np.array([[1 , 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) >>> x.flatten() array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) # Don't change the original array >>> a1 = x.flatten() >>> a1[0] = 99 >>> print(x) # Original array [[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] >>> print(a1) # New array [99 2 3 4 5 6 7 8 9 10 11 12] # Change the original array >>> a2 = x.ravel() >>> a2[0] = 98 >>> print(x) # Original array [[98 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] >>> print(a2) # New array [98 2 3 4 5 6 7 8 9 10 11 12] ``` **Datatypes** ```python import numpy as np x = np.array([1, 2]) # Let numpy choose the datatype print(x.dtype) # Prints "int64" x = np.array([1.0, 2.0]) # Let numpy choose the datatype print(x.dtype) # Prints "float64" x = np.array([1, 2], dtype=np.int64) # Force a particular datatype print(x.dtype) # Prints "int64" ``` **Array math** ```python import numpy as np x = np.array([[1,2],[3,4]], dtype=np.float64) y = np.array([[5,6],[7,8]], dtype=np.float64) # Elementwise sum; both produce the array # [[ 6.0 8.0] # [10.0 12.0]] print(x + y) print(np.add(x, y)) # Elementwise difference; both produce the array # [[-4.0 -4.0] # [-4.0 -4.0]] print(x - y) print(np.subtract(x, y)) # Elementwise product; both produce the array # [[ 5.0 12.0] # [21.0 32.0]] print(x * y) print(np.multiply(x, y)) # Elementwise division; both produce the array # [[ 0.2 0.33333333] # [ 0.42857143 0.5 ]] print(x / y) print(np.divide(x, y)) # Elementwise square root; produces the array # [[ 1. 1.41421356] # [ 1.73205081 2. ]] print(np.sqrt(x)) ``` ```python import numpy as np x = np.array([[1,2],[3,4]]) y = np.array([[5,6],[7,8]]) v = np.array([9,10]) w = np.array([11, 12]) # Inner product of vectors; both produce 219 print(v.dot(w)) print(np.dot(v, w)) # Matrix / vector product; both produce the rank 1 array [29 67] print(x.dot(v)) print(np.dot(x, v)) # Matrix / matrix product; both produce the rank 2 array # [[19 22] # [43 50]] print(x.dot(y)) print(np.dot(x, y)) ``` ```python import numpy as np x = np.array([[1,2],[3,4]]) print(np.sum(x)) # Compute sum of all elements; prints "10" print(np.sum(x, axis=0)) # Compute sum of each column; prints "[4 6]" print(np.sum(x, axis=1)) # Compute sum of each row; prints "[3 7]" ``` ```python import numpy as np x = np.array([[1,2], [3,4]]) print(x) # Prints "[[1 2] # [3 4]]" print(x.T) # Prints "[[1 3] # [2 4]]" # Note that taking the transpose of a rank 1 array does nothing: v = np.array([1,2,3]) print(v) # Prints "[1 2 3]" print(v.T) # Prints "[1 2 3]" print(np.mean(x, axis=0)) # [2. 3.] print(np.mean(x, axis=1)) # [1.5 3.5] ``` ## Save to files ```python >>> a = np.array([1, 2, 3, 4, 5, 6]) >>> np.save('filename', a) >>> b = np.load('filename.npy') >>> print(b) [1 2 3 4 5 6] >>> csv_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8]) >>> np.savetxt('new_file.csv', csv_arr) >>> np.loadtxt('new_file.csv') array([1., 2., 3., 4., 5., 6., 7., 8.]) ``` ## Plot figures ```python >>> a = np.array([2, 1, 5, 7, 4, 6, 8, 14, 10, 9, 18, 20, 22]) >>> import matplotlib.pyplot as plt >>> plt.plot(a) ``` **2D** ```python >>> x = np.linspace(0, 5, 20) >>> y = np.linspace(0, 10, 20) >>> plt.plot(x, y, 'purple') # line >>> plt.plot(x, y, 'o') # dots ``` **3D** ```python import matplotlib.pyplot as plt %matplotlib inline fig = plt.figure() ax = fig.add_subplot(projection='3d') X = np.arange(-5, 5, 0.15) Y = np.arange(-5, 5, 0.15) X, Y = np.meshgrid(X, Y) R = np.sqrt(X**2 + Y**2) # Z = np.sin(R) ax.plot_surface(X, Y, R, rstride=1, cstride=1, cmap='viridis') ```