!pwd
import numpy as np

/private/var/folders/k8/zfp7dvcs1m326gz1brql1tv80000gn/T/tmp.7gVLKvoQ


l = list(range(100))
print(type(l))
print(l)

<class 'list'>
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


a = np.array(l)
print(type(a))
print(a)

<class 'numpy.ndarray'>
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


# 2d array
b = np.array(
    [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
    ],
)

print(b)

[[1. 2. 3.]
 [4. 5. 6.]]


print(a.shape, a.dtype)

(100,) int64


print(b.shape, b.dtype)

(2, 3) float64


b2 = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ],
    dtype=np.float16,
)
print(b2)
print(b2.dtype)

[[1. 2. 3.]
 [4. 5. 6.]]
float16


# reading
print(b[1, 2])

6.0


# writing
b[1, 2] = 333.0
print(b)

[[  1.   2.   3.]
 [  4.   5. 333.]]


# extract column
b[:, 1]

array([2., 5.])


# extract row:
b[1, :]

array([  4.,   5., 333.])


print(2 * b + 1)

[[  3.   5.   7.]
 [  9.  11. 667.]]


# element wise:
print(b + b)

[[  2.   4.   6.]
 [  8.  10. 666.]]


print(b * b)

[[1.00000e+00 4.00000e+00 9.00000e+00]
 [1.60000e+01 2.50000e+01 1.10889e+05]]


m = np.array(
    [
        [1, 2, 3],
        [2, 3, 4],
        [4, 5, 6],
    ]
)
x = np.array(
    [1, 1, 1],
)


print(m @ x)

[ 6  9 15]


print(m @ m)

[[17 23 29]
 [24 33 42]
 [38 53 68]]


# from 0 to 7 (excluded)
x = np.arange(7)
print(x)

[0 1 2 3 4 5 6]


# from 3 to 7 (excluded)
y = np.arange(3, 7)
print(y)

[3 4 5 6]


# from 7 down to 3 (excluded)
# by using negative step size:
z = np.arange(7, 3, -1)
print(z)

[7 6 5 4]


# from 1 to 2 (included!), result has 11 points:
print(np.linspace(1, 2, 11))

[1.  1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2. ]


# 3x4 matrix
np.ones((3, 4))

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])


# 2x3 matrix
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])


# 4x4 identity matrix
ii = np.eye(4)
print(ii)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


np.sin(b)

array([[ 0.84147098,  0.90929743,  0.14112001],
       [-0.7568025 , -0.95892427, -0.00882117]])


print(b)

[[  1.   2.   3.]
 [  4.   5. 333.]]


print(b[b % 2 != 0])

[  1.   3.   5. 333.]


print(b % 2 != 0)

[[ True False  True]
 [False  True  True]]


print(b[b % 2 != 0])

[  1.   3.   5. 333.]


b[b % 2 != 0] = 17.0


print(b)

[[17.  2. 17.]
 [ 4. 17. 17.]]


a = np.arange(1, 4)
b = 100 * np.eye(3)


print(a.shape, b.shape)

(3,) (3, 3)

a

array([1, 2, 3])

b

array([[100.,   0.,   0.],
       [  0., 100.,   0.],
       [  0.,   0., 100.]])


a + b

array([[101.,   2.,   3.],
       [  1., 102.,   3.],
       [  1.,   2., 103.]])


# shape (3,) to (3, 1)
a[:, None].shape

(3, 1)


# now we add column-wise:
a[:, None] + b

array([[101.,   1.,   1.],
       [  2., 102.,   2.],
       [  3.,   3., 103.]])


points = 100 * np.random.random(size=(100, 3))
print(points.shape)

(100, 3)


print(points[:4])

[[68.52935386 71.11285728 57.15705092]
 [61.74102221 51.73097121 66.62028454]
 [52.58186402 85.71699966 38.49240055]
 [35.89422478 79.29291599 84.74216476]]


query_point = 100 * np.random.random(size=(3,))
print(query_point)

[50.67751488 53.24989981  3.75965148]


pairwise_diff = points - query_point
print(pairwise_diff.shape)

(100, 3)


distances_squared = np.sum(pairwise_diff**2, axis=1)
print(distances_squared.shape)

(100,)


min_distance = np.min(distances_squared) ** 0.5
print(min_distance)

11.529689222318954


i = np.argmin(distances_squared)
print(i)

15


closest_point = points[i, :]
print(closest_point)

[61.67676727 55.9303434   1.57667126]


# reshape

numbers = np.arange(24)
matrix = numbers.reshape(4, 6)
print(matrix)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]


# we don't need to compute the last dimension, since
# matrix has 24 entries which only leaves 4 for the last
# dimension, the -1 can also be used at other positions:
cube = matrix.reshape(2, 3, -1)
print(cube)

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]


# transpose a matrix:
print(matrix.T)

[[ 0  6 12 18]
 [ 1  7 13 19]
 [ 2  8 14 20]
 [ 3  9 15 21]
 [ 4 10 16 22]
 [ 5 11 17 23]]


# stack vectors to a matrix:
v1 = np.arange(3)
v2 = v1 + 3
v3 = v1 + 10

v4 = np.hstack((v1, v2, v3))
print(v4)

[ 0  1  2  3  4  5 10 11 12]


m = np.vstack((v1, v2, v3))
print(m)

[[ 0  1  2]
 [ 3  4  5]
 [10 11 12]]


# array to list:
print(m.tolist())

[[0, 1, 2], [3, 4, 5], [10, 11, 12]]


# flatten matrix:
print(m.flatten())

[ 0  1  2  3  4  5 10 11 12]


# create your own vectorized function:


def transform(x):
    if x <= 1:
        return 0
    return x**0.5


# otypes is the return type, else we will cast to the dtype
# of the input array:
vectorized = np.vectorize(transform, otypes=(np.float64,))
print(vectorized(m))

[[0.         0.         1.41421356]
 [1.73205081 2.         2.23606798]
 [3.16227766 3.31662479 3.46410162]]


# import conventin
import pandas as pd


# Inline, sharp plots on jupyter notebooks on Mac:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


!head -n 6 data/brain_size.csv

"";"Gender";"FSIQ";"VIQ";"PIQ";"Weight";"Height";"MRI_Count"
"1";"Female";133;132;124;"118";"64.5";816932
"2";"Male";140;150;124;".";"72.5";1001121
"3";"Male";139;123;150;"143";"73.3";1038437
"4";"Male";133;129;128;"172";"68.8";965353
"5";"Female";137;132;134;"147";"65.0";951545


df = pd.read_csv("data/brain_size.csv", sep=";", na_values=".", index_col=0)

df.head(5)


print(df.shape)

(40, 7)


print(df.columns)

Index(['Gender', 'FSIQ', 'VIQ', 'PIQ', 'Weight', 'Height', 'MRI_Count'], dtype='object')


# pandas auto-detects types from the file:
print(df.dtypes)

Gender        object
FSIQ           int64
VIQ            int64
PIQ            int64
Weight       float64
Height       float64
MRI_Count      int64
dtype: object


print(df.index)

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40],
           dtype='int64')


df.describe()


# integer based indexing:
print(df['Gender'].iloc[:3])

1    Female
2      Male
3      Male
Name: Gender, dtype: object


# access columns using a list of strings:
df[["Gender", "Weight"]].head(5)


# combined indexing
df.iloc[:3][["Gender", "Weight"]]


df[["Gender", "Weight"]].iloc[:3]


df.iloc[:3, [0, 4]]


df.loc[:3, ["Gender", "Weight"]]


# Fancy indexing:
df.loc[df.Weight > 180, ["Gender", "Weight"]]


# pure filteing, all columns, no `.loc` needed here:
df[(df.Weight > 150) & (df.Gender == "Male")]


groups = df.groupby("Gender")
for key, group in groups:
    print(key)
    print(group.head(5))
    print()

Female
   Gender  FSIQ  VIQ  PIQ  Weight  Height  MRI_Count
1  Female   133  132  124   118.0    64.5     816932
5  Female   137  132  134   147.0    65.0     951545
6  Female    99   90  110   146.0    69.0     928799
7  Female   138  136  131   138.0    64.5     991305
8  Female    92   90   98   175.0    66.0     854258

Male
   Gender  FSIQ  VIQ  PIQ  Weight  Height  MRI_Count
2    Male   140  150  124     NaN    72.5    1001121
3    Male   139  123  150   143.0    73.3    1038437
4    Male   133  129  128   172.0    68.8     965353
9    Male    89   93   84   134.0    66.3     904858
10   Male   133  114  147   172.0    68.8     955466


df.groupby("Gender")["Weight"].mean()

Gender
Female    137.200000
Male      166.444444
Name: Weight, dtype: float64


def is_tall(h):
    # 75 inches is ~190 cm 
    return h > 75

df["Is_Large"] = df.Height.apply(is_tall)
print(df.head())

   Gender  FSIQ  VIQ  PIQ  Weight  Height  MRI_Count  Is_Large
1  Female   133  132  124   118.0    64.5     816932     False
2    Male   140  150  124     NaN    72.5    1001121     False
3    Male   139  123  150   143.0    73.3    1038437     False
4    Male   133  129  128   172.0    68.8     965353     False
5  Female   137  132  134   147.0    65.0     951545     False


for key, group in df.groupby(["Gender", "Is_Large"]):
    print(key)
    print(group.describe())
    print()

('Female', False)
             FSIQ         VIQ         PIQ      Weight     Height     MRI_Count
count   20.000000   20.000000   20.000000   20.000000  20.000000      20.00000
mean   111.900000  109.450000  110.450000  137.200000  65.765000  862654.60000
std     23.686327   21.670924   21.946046   16.953807   2.288248   55893.55578
min     77.000000   71.000000   72.000000  106.000000  62.000000  790619.00000
25%     90.250000   90.000000   93.000000  125.750000  64.500000  828062.00000
50%    115.500000  116.000000  115.000000  138.500000  66.000000  855365.00000
75%    133.000000  129.000000  128.750000  146.250000  66.875000  882668.50000
max    140.000000  136.000000  147.000000  175.000000  70.500000  991305.00000

('Male', False)
             FSIQ        VIQ         PIQ      Weight    Height     MRI_Count
count   16.000000   16.00000   16.000000   14.000000  15.00000  1.600000e+01
mean   119.250000  119.68750  114.937500  160.857143  70.18000  9.496824e+05
std     26.185238   26.80726   24.593953   19.174732   2.40125  5.340396e+04
min     80.000000   77.00000   74.000000  132.000000  66.30000  8.799870e+05
25%     89.750000   95.25000   86.000000  145.000000  68.80000  9.192588e+05
50%    134.000000  126.00000  124.000000  163.000000  70.00000  9.472415e+05
75%    140.000000  145.00000  128.750000  176.500000  72.25000  9.579378e+05
max    144.000000  150.00000  150.000000  191.000000  74.00000  1.079549e+06

('Male', True)
            FSIQ         VIQ         PIQ      Weight  Height     MRI_Count
count    4.00000    4.000000    4.000000    4.000000   4.000  4.000000e+00
mean    98.00000   97.500000   98.250000  186.000000  76.125  9.755475e+05
std      6.63325    6.757712   13.720423    5.354126   0.750  6.946209e+04
min     89.00000   91.000000   84.000000  179.000000  75.500  9.059400e+05
25%     95.00000   94.750000   87.750000  184.250000  75.500  9.283822e+05
50%    100.00000   96.000000   99.500000  186.500000  76.000  9.668940e+05
75%    103.00000   98.750000  110.000000  188.250000  76.625  1.014059e+06
max    103.00000  107.000000  110.000000  192.000000  77.000  1.062462e+06


pd.pivot_table(df, index=["Gender", "Is_Large"])


df.boxplot(column=["Weight", "Height"])

<AxesSubplot: >


df.groupby("Gender").boxplot(column=["Weight", "Height"])

Female         AxesSubplot(0.1,0.15;0.363636x0.75)
Male      AxesSubplot(0.536364,0.15;0.363636x0.75)
dtype: object


import numpy as np


def f(r):
    return (
        r**2 * np.arccos(r / 2)
        + np.arccos(1 - 0.5 * r**2)
        - 0.5 * r * np.sqrt(4 - r**2)
        - np.pi / 2
    )


import scipy.optimize


# find zero of f(r) starting with guess r=0.5:
scipy.optimize.fsolve(f, 0.5)

array([1.15872847])


from matplotlib import pyplot as plt


r = np.linspace(0, 2, 100)


print(r.min(), r.max(), r.shape)

0.0 2.0 (100,)


plt.plot(r, f(r))
plt.plot([0, 2], [0, 0], ":")
plt.grid()


# adaptive numerical integration:
from scipy.integrate import quad


def integrand(t, r):
    return np.sqrt(r**2 - t**2) + np.sqrt(1 - t**2) - 1


def f(r):
    integral, error_estimate = quad(
        integrand, 0, np.sqrt(r**2 - r**4 / 4), args=(r,)
    )
    return integral - np.pi / 4


# find zero of f(r) starting with guess r=0.5:
scipy.optimize.fsolve(f, 0.5)

array([1.15872847])

Scientific Python Introduction¶

numpy¶

Arrays¶

Array creation methods¶

Vectorized functions¶

Filtering / Fancy indexing¶

Broadcasting¶

Example¶

Other helpful functions¶

Pandas¶

data.frame basics¶

Indexing¶

Grouping¶

Computing new colums¶

Plotting¶

Performance of pandas¶

Not covered¶

Solving the goat problem with scipy*¶

	Gender	FSIQ	VIQ	PIQ	Weight	Height	MRI_Count
1	Female	133	132	124	118.0	64.5	816932
2	Male	140	150	124	NaN	72.5	1001121
3	Male	139	123	150	143.0	73.3	1038437
4	Male	133	129	128	172.0	68.8	965353
5	Female	137	132	134	147.0	65.0	951545

	FSIQ	VIQ	PIQ	Weight	Height	MRI_Count
count	40.000000	40.000000	40.00000	38.000000	39.000000	4.000000e+01
mean	113.450000	112.350000	111.02500	151.052632	68.525641	9.087550e+05
std	24.082071	23.616107	22.47105	23.478509	3.994649	7.228205e+04
min	77.000000	71.000000	72.00000	106.000000	62.000000	7.906190e+05
25%	89.750000	90.000000	88.25000	135.250000	66.000000	8.559185e+05
50%	116.500000	113.000000	115.00000	146.500000	68.000000	9.053990e+05
75%	135.500000	129.750000	128.00000	172.000000	70.500000	9.500780e+05
max	144.000000	150.000000	150.00000	192.000000	77.000000	1.079549e+06

		FSIQ	Height	MRI_Count	PIQ	VIQ	Weight
Gender	Is_Large
Female	False	111.90	65.765	862654.600	110.4500	109.4500	137.200000
Male	False	119.25	70.180	949682.375	114.9375	119.6875	160.857143
Male	True	98.00	76.125	975547.500	98.2500	97.5000	186.000000