Without solutions
Python libraries for working with data¶
# Copyright (c) Thalesians Ltd, 2019-2023. All rights reserved.
# Copyright (c) Paul Alexander Bilokon, 2019-2023. All rights reserved.
# Author: Paul Alexander Bilokon <[email protected]>
# This version: 2.0 (2023.11.17)
# Previous versions: 1.0 (2019.03.25)
# Email: [email protected]
Motivation¶
Two third-party (but de facto standard) Python libraries play a particularly important role in Data Analysis: pandas and NumPy.
The data types defined in pandas — DataFrame and Series — enable the data scientist to work effectively with tables. pandas provides some of the functionality that we find in SQL databases and spreadsheets. Since much of the data that we deal with comes in the form of tables, pandas is extremely useful.
NumPy provides a lower-level type — the multidimensional array. This data type is used as an underlying implementation by pandas DataFrames and Series. NumPy arrays are used wherever bulk operations on numbers are needed in Python, which is what much of Data Science is about. NumPy arrays can be seen as a Python implementation of the key linear algebra object — the matrix.
Objectives¶
- To introduce pandas DataFrames and Series.
- To show how to quickly inspect a pandas DataFrame.
- To show how pandas DataFrames can be indexed using loc and iloc.
- To show how a DataFrame's columns can be indexed.
- To show how to iterate through columns of a DataFrame.
- To show how a DataFrame can be effectively summarised.
- To show how to add a column to a DataFrame.
- To show how to overwrite a column in a DataFrame.
- To show how to rearrange columns in a DataFrame.
- To show how to delete a column from a DataFrame.
- To show how to apply a function to a column in a DataFrame.
- To show how to filter data in a DataFrame based on some boolean expression.
- To show how to deal with missing data (NaNs).
- To show how to use groupby.
- To show how to append to a DataFrame.
- To show how to join on a DataFrame.
- To show how to get at the NumPy arrays behind the pandas DataFrame and Series.
- To show how to define one-dimensional (flat) NumPy arrays.
- To show how to use np.arange, np.linspace, and np.logspace.
- To explain the reasons to prefer NumPy arrays over standard Python data structures, such as lists.
- To show how to define two-dimensional NumPy arrays, thus implementing matrices.
- To show how to create NumPy arrays using np.zeros, np.ones, np.full, np.empty, np.tile.
- To show how to use 32-bit floating point numbers (float32) instead of 64-bit floating point numbers in NumPy arrays.
- To show how to generate random matrices.
- To show how to reshape matrices.
- To show how to multiply matrices by scalars.
- To show how to add matrices together.
- To show how to multiply matrices together.
- To show how to mix arrays and scalars in arithmetic operations; to explain broadcasting.
- To show how to transpose matrices.
- To show how to invert matrices.
- To show how to create identity matrices.
- To show how to stack NumPy arrays horizontally.
- To show how to stack NumPy arrays vertically.
- To show how to index NumPy arrays, including boolean indices.
- To show how to compare NumPy arrays.
- To show how to use some useful functions: np.cumsum, np.cumprod, np.min, np.max, np.argmin, np.argmax, np.mean, np.var, np.std.
- To show how to apply a function to each row/column in a NumPy array.
- To show how to use NumPy array flags.
- To show how to make NumPy arrays immutable (kind of).
- To show how to find out more about the configuration of the NumPy library, e.g. which BLAS it is using.
pandas DataFrames and Series¶
pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. We can import it with
import pandas as pd
The key data type provided by pandas is DataFrame. Let us create one:
df = pd.DataFrame(
{
'transaction date': [2012.917, 2012.917, 2013.583, 2013.500, 2012.833, 2012.667, 2012.667, 2013.417, 2013.500, 2013.417, 2013.083, 2013.333, 2012.917, 2012.667, 2013.500],
'distance to the nearest MRT station': [84.87882, 306.59470, 561.98450, 561.98450, 390.56840, 2175.03000, 623.47310, 287.60250, 5512.03800, 1783.18000, 405.21340, 90.45606, 492.23130, 2469.64500, 1164.83800],
'number of convenience stores': [10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, 9, 5, 4, 4],
'latitude': [24.98298, 24.98034, 24.98746, 24.98746, 24.97937, 24.96305, 24.97933, 24.98042, 24.95095, 24.96731, 24.97349, 24.97433, 24.96515, 24.96108, 24.99156],
'longitude': [121.54024, 121.53951, 121.54391, 121.54391, 121.54245, 121.51254, 121.53642, 121.54228, 121.48458, 121.51486, 121.53372, 121.54310, 121.53737, 121.51046, 121.53406],
'house price per unit area': [37.9, 42.2, 47.3, 54.8, 43.1, 32.1, 40.3, 46.7, 18.8, 22.1, 41.4, 58.1, 39.3, 23.8, 34.3]
},
columns=[
'transaction date',
'distance to the nearest MRT station',
'number of convenience stores',
'latitude',
'longitude',
'house price per unit area'
])
pandas DataFrames in many ways resemble SQL database tables and worksheets in spreadsheet applications.
How to quickly inspect a DataFrame¶
Let's examine our newly created DataFrame:
df
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
0 | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
1 | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
2 | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
3 | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
4 | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
5 | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 |
6 | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
7 | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
8 | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 |
9 | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 |
10 | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
11 | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
12 | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
13 | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 |
14 | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
In practice, we may be dealing with DataFrames containing thousands of rows of data (not fifteen, as in our example). So instead of looking at the entire DataFrame we may look at its head:
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
0 | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
1 | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
2 | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
3 | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
4 | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
We may wish to look at more rows from the DataFrame's head:
df.head(10)
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
0 | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
1 | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
2 | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
3 | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
4 | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
5 | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 |
6 | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
7 | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
8 | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 |
9 | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 |
Or examine its tail:
df.tail()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
10 | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
11 | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
12 | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
13 | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 |
14 | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
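Besides head and tail, a DataFrame also has a sample method, which returns a given number of randomly selected rows. This is handy for spot-checking a large DataFrame (the rows returned will, of course, vary from run to run):
df.sample(3)   # three randomly chosen rows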
The index of the DataFrame, loc and iloc¶
Notice the numbers in bold on the left. These constitute the index of the DataFrame:
list(df.index)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
We can access the list of columns of the data frame through
list(df.columns)
['transaction date', 'distance to the nearest MRT station', 'number of convenience stores', 'latitude', 'longitude', 'house price per unit area']
Thus the index indexes the rows, whereas the column names index the columns. We can access individual rows of the DataFrame through
df.loc[3]
transaction date                       2013.50000
distance to the nearest MRT station     561.98450
number of convenience stores              5.00000
latitude                                 24.98746
longitude                               121.54391
house price per unit area                54.80000
Name: 3, dtype: float64
Notice that the index of the data frame doesn't have to be formed from integers starting with zero, as in our case:
list(df.index)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
We could have chosen to index the rows with strings, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O'. Since the DataFrame is a mutable object, i.e. it can be changed after construction, we can replace its index accordingly:
df.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O']
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
Now we can index the rows with
df.loc['D']
transaction date                       2013.50000
distance to the nearest MRT station     561.98450
number of convenience stores              5.00000
latitude                                 24.98746
longitude                               121.54391
house price per unit area                54.80000
Name: D, dtype: float64
Whatever the index of the DataFrame, we can also refer to the rows through their integer offsets from the top, using iloc instead of loc:
df.iloc[3]
transaction date                       2013.50000
distance to the nearest MRT station     561.98450
number of convenience stores              5.00000
latitude                                 24.98746
longitude                               121.54391
house price per unit area                54.80000
Name: D, dtype: float64
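Both loc and iloc also accept slices. One caveat to bear in mind: loc slices are inclusive of the end label, whereas iloc slices, like Python's own, exclude the end position:
df.loc['B':'D']   # rows B, C and D: the end label is included
df.iloc[0:3]      # rows at offsets 0, 1 and 2: position 3 is excluded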
Whereas the type of the entire object is
type(df)
pandas.core.frame.DataFrame
the type of df.loc['D'] (equivalently, of df.iloc[3]) is
type(df.loc['D'])
pandas.core.series.Series
Series is the second most important type defined in the pandas package (after DataFrame).
Indexing columns¶
As we said above, the list of columns in the dataframe is accessible through
list(df.columns)
['transaction date', 'distance to the nearest MRT station', 'number of convenience stores', 'latitude', 'longitude', 'house price per unit area']
Individual columns can be indexed using
df['latitude']
A    24.98298
B    24.98034
C    24.98746
D    24.98746
E    24.97937
F    24.96305
G    24.97933
H    24.98042
I    24.95095
J    24.96731
K    24.97349
L    24.97433
M    24.96515
N    24.96108
O    24.99156
Name: latitude, dtype: float64
The type of the object thus obtained is
type(df['latitude'])
pandas.core.series.Series
You can also access multiple columns at once using
df[['latitude', 'longitude']]
latitude | longitude | |
---|---|---|
A | 24.98298 | 121.54024 |
B | 24.98034 | 121.53951 |
C | 24.98746 | 121.54391 |
D | 24.98746 | 121.54391 |
E | 24.97937 | 121.54245 |
F | 24.96305 | 121.51254 |
G | 24.97933 | 121.53642 |
H | 24.98042 | 121.54228 |
I | 24.95095 | 121.48458 |
J | 24.96731 | 121.51486 |
K | 24.97349 | 121.53372 |
L | 24.97433 | 121.54310 |
M | 24.96515 | 121.53737 |
N | 24.96108 | 121.51046 |
O | 24.99156 | 121.53406 |
The type of the resulting object is DataFrame, rather than Series:
type(df[['latitude', 'longitude']])
pandas.core.frame.DataFrame
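Note, in passing, that indexing with a one-element list of column names also yields a DataFrame rather than a Series:
type(df[['latitude']])   # pandas.core.frame.DataFrame: note the double brackets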
Iterating through columns¶
Let us demonstrate how we can iterate through the columns of a DataFrame. In this example we shall store the first (0th) value from each column in a dictionary:
firsts = {}
for c in df.columns:
firsts[c] = df[c].iloc[0]
firsts
{'transaction date': 2012.917, 'distance to the nearest MRT station': 84.87882, 'number of convenience stores': 10, 'latitude': 24.98298, 'longitude': 121.54024, 'house price per unit area': 37.9}
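As a side note, pandas also provides the items method, which yields (column name, Series) pairs directly; the same dictionary can be built with a comprehension:
firsts = {c: s.iloc[0] for c, s in df.items()}   # equivalent to the loop above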
Summarising the DataFrame¶
Another quick way to obtain a high-level view of a (potentially very large) DataFrame is with
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, A to O
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   transaction date                     15 non-null     float64
 1   distance to the nearest MRT station  15 non-null     float64
 2   number of convenience stores         15 non-null     int64
 3   latitude                             15 non-null     float64
 4   longitude                            15 non-null     float64
 5   house price per unit area            15 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 1.4+ KB
whereas the following will present summary statistics for each column:
df.describe()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
count | 15.000000 | 15.000000 | 15.000000 | 15.000000 | 15.000000 | 15.000000 |
mean | 2013.127867 | 1127.314552 | 5.133333 | 24.974952 | 121.530627 | 38.813333 |
std | 0.347586 | 1425.732372 | 2.722044 | 0.011395 | 0.017235 | 11.251341 |
min | 2012.667000 | 84.878820 | 1.000000 | 24.950950 | 121.484580 | 18.800000 |
25% | 2012.875000 | 348.581550 | 3.500000 | 24.966230 | 121.524290 | 33.200000 |
50% | 2013.083000 | 561.984500 | 5.000000 | 24.979330 | 121.537370 | 40.300000 |
75% | 2013.458500 | 1474.009000 | 6.500000 | 24.981700 | 121.542365 | 44.900000 |
max | 2013.583000 | 5512.038000 | 10.000000 | 24.991560 | 121.543910 | 58.100000 |
Adding a column¶
To add a column, assuming the column 'house age' does not yet exist:
df['house age'] = [32.0, 19.5, 13.3, 13.3, 5.0, 7.1, 34.5, 20.3, 31.7, 17.9, 34.8, 6.3, 13.0, 20.4, 13.2]
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | house age | |
---|---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 | 32.0 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 | 19.5 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 | 13.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 | 13.3 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 | 5.0 |
Overwriting a column¶
Similarly, if the column 'house age' already exists, the same assignment syntax overwrites it:
df['house age'] = [320., 195., 133., 133., 50., 71., 345., 203., 317., 179., 348., 63., 130., 204., 132.]
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | house age | |
---|---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 | 320.0 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 | 195.0 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 | 133.0 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 | 133.0 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 | 50.0 |
Rearranging the columns¶
To rearrange the columns:
df = df[[
'transaction date', 'house age',
'distance to the nearest MRT station', 'number of convenience stores',
'latitude', 'longitude',
'house price per unit area'
]]
df.head()
transaction date | house age | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|---|
A | 2012.917 | 320.0 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 195.0 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 133.0 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 133.0 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 50.0 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
Deleting a column¶
del df['house age']
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
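Note that del removes the column from df in place. If we would rather leave df intact, we could instead use the drop method, which returns a new DataFrame; before the deletion above, for instance, we could have written:
df_without_age = df.drop(columns=['house age'])   # df itself is left unchanged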
Applying a function to a column¶
import math
df['distance to the nearest MRT station'].apply(math.sqrt)
A     9.212970
B    17.509846
C    23.706212
D    23.706212
E    19.762803
F    46.637217
G    24.969443
H    16.958847
I    74.243101
J    42.227716
K    20.129913
L     9.510839
M    22.186286
N    49.695523
O    34.129723
Name: distance to the nearest MRT station, dtype: float64
Notice that the original column hasn’t been overwritten:
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
To overwrite it,
df['distance to the nearest MRT station'] = df['distance to the nearest MRT station'].apply(math.sqrt)
C:\Users\paul\AppData\Local\Temp\ipykernel_22944\4176019659.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance to the nearest MRT station'] = df['distance to the nearest MRT station'].apply(math.sqrt)
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 9.212970 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 17.509846 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 23.706212 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 23.706212 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 19.762803 | 5 | 24.97937 | 121.54245 | 43.1 |
To reverse what we have just done:
df['distance to the nearest MRT station'] = df['distance to the nearest MRT station'].apply(lambda x: x * x)
C:\Users\paul\AppData\Local\Temp\ipykernel_22944\2618952524.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance to the nearest MRT station'] = df['distance to the nearest MRT station'].apply(lambda x: x * x)
df.head()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
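As an aside, for mathematical functions such as the square root, NumPy (introduced later in this notebook) provides vectorised equivalents that operate on a whole column at once and are typically faster than apply:
import numpy as np
np.sqrt(df['distance to the nearest MRT station'])   # a Series of square roots; df is unchanged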
Filtering data¶
Suppose that we want to consider only those rows where the value in the column 'number of convenience stores' is greater than or equal to 7. We can then index df by
df['number of convenience stores'] >= 7
A     True
B     True
C    False
D    False
E    False
F    False
G     True
H    False
I    False
J    False
K    False
L     True
M    False
N    False
O    False
Name: number of convenience stores, dtype: bool
obtaining, as a result,
df[df['number of convenience stores'] >= 7]
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
If we want to consider those rows where the 'number of convenience stores' is greater than or equal to seven or the distance to the nearest MRT station is less than 500, we index the dataframe by
(df['number of convenience stores'] >= 7) | (df['distance to the nearest MRT station'] < 500)
A     True
B     True
C    False
D    False
E     True
F    False
G     True
H     True
I    False
J    False
K     True
L     True
M     True
N    False
O    False
dtype: bool
The result being
df[(df['number of convenience stores'] >= 7) | (df['distance to the nearest MRT station'] < 500)]
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
If instead we want to consider those rows where the 'number of convenience stores' is greater than or equal to seven and the distance to the nearest MRT station is less than 500, we index the dataframe by
(df['number of convenience stores'] >= 7) & (df['distance to the nearest MRT station'] < 500)
A     True
B     True
C    False
D    False
E    False
F    False
G    False
H    False
I    False
J    False
K    False
L     True
M    False
N    False
O    False
dtype: bool
The result being
df[(df['number of convenience stores'] >= 7) & (df['distance to the nearest MRT station'] < 500)]
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
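As an alternative to boolean-mask indexing, pandas offers the query method; note that column names containing spaces must be wrapped in backticks:
df.query('`number of convenience stores` >= 7 and `distance to the nearest MRT station` < 500')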
Suppose that we assign the result of the conjunctive filter above to df1 and then attempt to modify one of its values:
df1 = df[(df['number of convenience stores'] >= 7) & (df['distance to the nearest MRT station'] < 500)]
df1.loc['L', 'number of convenience stores'] = 100
We get a SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. The slice (the view of a subset of the original DataFrame) has changed:
df1
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
L | 2013.333 | 90.45606 | 100 | 24.97433 | 121.54310 | 58.1 |
Whereas the original has not changed:
df
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
F | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
For more information on this behaviour, read http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
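A common way to avoid this warning, and to make the independence of df1 from df explicit, is to take an explicit copy of the slice before modifying it:
df1 = df[(df['number of convenience stores'] >= 7) & (df['distance to the nearest MRT station'] < 500)].copy()
df1.loc['L', 'number of convenience stores'] = 100   # no warning: df1 now owns its own data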
Dealing with missing data¶
Let us examine what happens if we have missing data in the DataFrame.
To this end, let us take a copy
df_copy = df.copy()
and modify it like so:
import numpy as np
df_copy.loc['C', 'distance to the nearest MRT station'] = np.nan
df_copy.loc['E', 'number of convenience stores'] = np.nan
df_copy
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10.0 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9.0 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | NaN | 5.0 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5.0 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | NaN | 24.97937 | 121.54245 | 43.1 |
F | 2012.667 | 2175.03000 | 3.0 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7.0 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6.0 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1.0 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3.0 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1.0 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9.0 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5.0 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4.0 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4.0 | 24.99156 | 121.53406 | 34.3 |
The special np.nan value indicates that the data is missing or invalid (NaN stands for "not a number"). NaNs often result from numerical calculations. Their presence may interfere with further numerical work. One quick way to address the issue of NaNs is by dropping (removing) them:
df_copy.dropna()
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10.0 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9.0 | 24.98034 | 121.53951 | 42.2 |
D | 2013.500 | 561.98450 | 5.0 | 24.98746 | 121.54391 | 54.8 |
F | 2012.667 | 2175.03000 | 3.0 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7.0 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6.0 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1.0 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3.0 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1.0 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9.0 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5.0 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4.0 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4.0 | 24.99156 | 121.53406 | 34.3 |
This has returned a copy of the DataFrame with all rows containing NaNs removed. df_copy itself has not been changed:
df_copy
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10.0 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9.0 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | NaN | 5.0 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5.0 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | NaN | 24.97937 | 121.54245 | 43.1 |
F | 2012.667 | 2175.03000 | 3.0 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7.0 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6.0 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1.0 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3.0 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1.0 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9.0 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5.0 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4.0 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4.0 | 24.99156 | 121.53406 | 34.3 |
But we could overwrite it with the result of dropna():
df_copy = df_copy.dropna()
df_copy
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10.0 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9.0 | 24.98034 | 121.53951 | 42.2 |
D | 2013.500 | 561.98450 | 5.0 | 24.98746 | 121.54391 | 54.8 |
F | 2012.667 | 2175.03000 | 3.0 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7.0 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6.0 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1.0 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3.0 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1.0 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9.0 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5.0 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4.0 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4.0 | 24.99156 | 121.53406 | 34.3 |
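Dropping rows is not the only remedy: the fillna method replaces NaNs with a value of our choosing instead. Had we not dropped them above, we could, for instance, have written:
df_copy.fillna(0.)               # replace every NaN with zero
df_copy.fillna(df_copy.mean())   # replace each NaN with its column's mean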
groupby¶
It may be useful to summarise the pandas DataFrame for a given value of a particular column. This can be achieved using groupby. For example, we may wish to look at the mean of each column for each possible value of 'number of convenience stores':
df.groupby('number of convenience stores').mean()
transaction date | distance to the nearest MRT station | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|
number of convenience stores | |||||
1 | 2013.29150 | 2958.625700 | 24.962220 | 121.509150 | 30.100 |
3 | 2013.04200 | 1979.105000 | 24.965180 | 121.513700 | 27.100 |
4 | 2013.08350 | 1817.241500 | 24.976320 | 121.522260 | 29.050 |
5 | 2013.20825 | 501.692175 | 24.979860 | 121.541910 | 46.125 |
6 | 2013.41700 | 287.602500 | 24.980420 | 121.542280 | 46.700 |
7 | 2012.66700 | 623.473100 | 24.979330 | 121.536420 | 40.300 |
9 | 2013.12500 | 198.525380 | 24.977335 | 121.541305 | 50.150 |
10 | 2012.91700 | 84.878820 | 24.982980 | 121.540240 | 37.900 |
It looks like the house price per unit area may increase with the number of convenience stores, although with so few data points we cannot be particularly confident in this conclusion; the distance to the nearest MRT station, on the other hand, does seem to decrease as the number of convenience stores increases.
Instead of the mean, we may consider the minimum:
df.groupby('number of convenience stores').min()
transaction date | distance to the nearest MRT station | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|
number of convenience stores | |||||
1 | 2013.083 | 405.21340 | 24.95095 | 121.48458 | 18.8 |
3 | 2012.667 | 1783.18000 | 24.96305 | 121.51254 | 22.1 |
4 | 2012.667 | 1164.83800 | 24.96108 | 121.51046 | 23.8 |
5 | 2012.833 | 390.56840 | 24.96515 | 121.53737 | 39.3 |
6 | 2013.417 | 287.60250 | 24.98042 | 121.54228 | 46.7 |
7 | 2012.667 | 623.47310 | 24.97933 | 121.53642 | 40.3 |
9 | 2012.917 | 90.45606 | 24.97433 | 121.53951 | 42.2 |
10 | 2012.917 | 84.87882 | 24.98298 | 121.54024 | 37.9 |
Or maximum:
df.groupby('number of convenience stores').max()
transaction date | distance to the nearest MRT station | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|
number of convenience stores | |||||
1 | 2013.500 | 5512.03800 | 24.97349 | 121.53372 | 41.4 |
3 | 2013.417 | 2175.03000 | 24.96731 | 121.51486 | 32.1 |
4 | 2013.500 | 2469.64500 | 24.99156 | 121.53406 | 34.3 |
5 | 2013.583 | 561.98450 | 24.98746 | 121.54391 | 54.8 |
6 | 2013.417 | 287.60250 | 24.98042 | 121.54228 | 46.7 |
7 | 2012.667 | 623.47310 | 24.97933 | 121.53642 | 40.3 |
9 | 2013.333 | 306.59470 | 24.98034 | 121.54310 | 58.1 |
10 | 2012.917 | 84.87882 | 24.98298 | 121.54024 | 37.9 |
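Several summary statistics can also be computed in a single call using agg:
df.groupby('number of convenience stores')['house price per unit area'].agg(['mean', 'min', 'max'])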
Appending to a DataFrame¶
Let us again consider our DataFrame:
df
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
F | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
Suppose that we have another DataFrame:
df1 = pd.DataFrame({
'transaction date': [2013.417, 2013.083, 2012.917],
'distance to the nearest MRT station': [378.90278, 90.23891, 489.32891],
'number of convenience stores': [5, 6, 7],
'latitude': [24.97432, 24.97435, 24.97428],
'longitude': [121.53290, 121.53290, 121.53390],
'house price per unit area': [33.2, 82.0, 32.1]
},
index = ['P', 'Q', 'R'],
columns = ['transaction date', 'distance to the nearest MRT station', 'number of convenience stores',
'latitude', 'longitude', 'house price per unit area']
)
df1
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
P | 2013.417 | 378.90278 | 5 | 24.97432 | 121.5329 | 33.2 |
Q | 2013.083 | 90.23891 | 6 | 24.97435 | 121.5329 | 82.0 |
R | 2012.917 | 489.32891 | 7 | 24.97428 | 121.5339 | 32.1 |
We can append df1 to the end of df using
pd.concat([df, df1])
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
F | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
P | 2013.417 | 378.90278 | 5 | 24.97432 | 121.53290 | 33.2 |
Q | 2013.083 | 90.23891 | 6 | 24.97435 | 121.53290 | 82.0 |
R | 2012.917 | 489.32891 | 7 | 24.97428 | 121.53390 | 32.1 |
Joining on a DataFrame¶
Let us again consider our DataFrame:
df
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | |
---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
F | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 |
I | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 |
J | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 |
N | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 |
O | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
Suppose we have another DataFrame:
df_comments = pd.DataFrame({
'comments': ['data to be validated', 'to be confirmed'],
},
index = ['H', 'J']
)
df_comments
comments | |
---|---|
H | data to be validated |
J | to be confirmed |
We can join df_comments onto df based on the matching indices:
joined_df = df.join(df_comments)
joined_df
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | comments | |
---|---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 | NaN |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 | NaN |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 | NaN |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 | NaN |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 | NaN |
F | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 | NaN |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 | NaN |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 | data to be validated |
I | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 | NaN |
J | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 | to be confirmed |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 | NaN |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 | NaN |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 | NaN |
N | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 | NaN |
O | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 | NaN |
We may wish to replace the NaNs that have resulted in the 'comments' column with blank strings:
joined_df.loc[joined_df['comments'].isnull(), 'comments'] = ''
joined_df
transaction date | distance to the nearest MRT station | number of convenience stores | latitude | longitude | house price per unit area | comments | |
---|---|---|---|---|---|---|---|
A | 2012.917 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 | |
B | 2012.917 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 | |
C | 2013.583 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 | |
D | 2013.500 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 | |
E | 2012.833 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 | |
F | 2012.667 | 2175.03000 | 3 | 24.96305 | 121.51254 | 32.1 | |
G | 2012.667 | 623.47310 | 7 | 24.97933 | 121.53642 | 40.3 | |
H | 2013.417 | 287.60250 | 6 | 24.98042 | 121.54228 | 46.7 | data to be validated |
I | 2013.500 | 5512.03800 | 1 | 24.95095 | 121.48458 | 18.8 | |
J | 2013.417 | 1783.18000 | 3 | 24.96731 | 121.51486 | 22.1 | to be confirmed |
K | 2013.083 | 405.21340 | 1 | 24.97349 | 121.53372 | 41.4 | |
L | 2013.333 | 90.45606 | 9 | 24.97433 | 121.54310 | 58.1 | |
M | 2012.917 | 492.23130 | 5 | 24.96515 | 121.53737 | 39.3 | |
N | 2012.667 | 2469.64500 | 4 | 24.96108 | 121.51046 | 23.8 | |
O | 2013.500 | 1164.83800 | 4 | 24.99156 | 121.53406 | 34.3 |
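The same replacement can be written more concisely with fillna:
joined_df['comments'] = joined_df['comments'].fillna('')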
Exercise¶
Consider the DataFrame
eg_df = pd.DataFrame({
'date': ['2019-09-01', '2019-09-02', '2019-09-03', '2019-09-04', '2019-09-05', '2019-09-06', '2019-09-07'],
'value': [3.78, 2.90, 3.29, 1.21, 3.20, 9.39, 8.90]
},
columns=['date', 'value'])
eg_df
date | value | |
---|---|---|
0 | 2019-09-01 | 3.78 |
1 | 2019-09-02 | 2.90 |
2 | 2019-09-03 | 3.29 |
3 | 2019-09-04 | 1.21 |
4 | 2019-09-05 | 3.20 |
5 | 2019-09-06 | 9.39 |
6 | 2019-09-07 | 8.90 |
Replace the default index with the parsed dates from the 'date' column. Once this is done, remove the 'date' column.
Exercise¶
Consider again the DataFrame
eg_df = pd.DataFrame({
'date': ['2019-09-01', '2019-09-02', '2019-09-03', '2019-09-04', '2019-09-05', '2019-09-06', '2019-09-07'],
'value': [3.78, 2.90, 3.29, 1.21, 3.20, 9.39, 8.90]
},
columns=['date', 'value'])
Find the mean, minimum, and maximum 'value'. Use the library functions numpy.mean, numpy.min, and numpy.max.
Exercise¶
Consider the DataFrame
eg_df = pd.DataFrame({
'date': ['2019-09-01', '2019-09-02', '2019-09-03', '2019-09-04', '2019-09-05', '2019-09-06', '2019-09-07'],
'value': [3.78, 2.90, 3.29, 1.21, 3.20, 9.39, 8.90]
},
columns=['date', 'value'])
eg_df
date | value | |
---|---|---|
0 | 2019-09-01 | 3.78 |
1 | 2019-09-02 | 2.90 |
2 | 2019-09-03 | 3.29 |
3 | 2019-09-04 | 1.21 |
4 | 2019-09-05 | 3.20 |
5 | 2019-09-06 | 9.39 |
6 | 2019-09-07 | 8.90 |
Join it with another DataFrame so that, for dates 2019-09-03 and 2019-09-06, the comment 'missing data' is added in the 'comments' column.
Exercise¶
Consider the DataFrame
eg_df = pd.DataFrame({
'date': ['2019-09-01', '2019-09-02', '2019-09-03', '2019-09-04', '2019-09-05', '2019-09-06', '2019-09-07'],
'value': [3.78, 2.90, 3.29, 1.21, 3.20, 9.39, 8.90]
},
columns=['date', 'value'])
eg_df
date | value | |
---|---|---|
0 | 2019-09-01 | 3.78 |
1 | 2019-09-02 | 2.90 |
2 | 2019-09-03 | 3.29 |
3 | 2019-09-04 | 1.21 |
4 | 2019-09-05 | 3.20 |
5 | 2019-09-06 | 9.39 |
6 | 2019-09-07 | 8.90 |
Two new data points become available: 9.89 for 2019-09-08 and 3.89 for 2019-09-09. Append them to the DataFrame.
Exercise¶
In the DataFrame resulting from appending the data in the previous exercise, find the mean value for each day of the week (Monday, Tuesday, Wednesday, etc.). You can get a weekday from a date using
import datetime as dt
dt.date(2019, 9, 2).weekday()
0
NumPy arrays¶
Many Python libraries (such as pandas) rely on NumPy under the hood. NumPy, imported with
import numpy as np
implements multidimensional arrays.
NumPy arrays behind the pandas DataFrame and Series¶
To access the underlying NumPy array of a pandas DataFrame, we can use values:
df.values
array([[2.0129170e+03, 8.4878820e+01, 1.0000000e+01, 2.4982980e+01, 1.2154024e+02, 3.7900000e+01],
       [2.0129170e+03, 3.0659470e+02, 9.0000000e+00, 2.4980340e+01, 1.2153951e+02, 4.2200000e+01],
       [2.0135830e+03, 5.6198450e+02, 5.0000000e+00, 2.4987460e+01, 1.2154391e+02, 4.7300000e+01],
       [2.0135000e+03, 5.6198450e+02, 5.0000000e+00, 2.4987460e+01, 1.2154391e+02, 5.4800000e+01],
       [2.0128330e+03, 3.9056840e+02, 5.0000000e+00, 2.4979370e+01, 1.2154245e+02, 4.3100000e+01],
       [2.0126670e+03, 2.1750300e+03, 3.0000000e+00, 2.4963050e+01, 1.2151254e+02, 3.2100000e+01],
       [2.0126670e+03, 6.2347310e+02, 7.0000000e+00, 2.4979330e+01, 1.2153642e+02, 4.0300000e+01],
       [2.0134170e+03, 2.8760250e+02, 6.0000000e+00, 2.4980420e+01, 1.2154228e+02, 4.6700000e+01],
       [2.0135000e+03, 5.5120380e+03, 1.0000000e+00, 2.4950950e+01, 1.2148458e+02, 1.8800000e+01],
       [2.0134170e+03, 1.7831800e+03, 3.0000000e+00, 2.4967310e+01, 1.2151486e+02, 2.2100000e+01],
       [2.0130830e+03, 4.0521340e+02, 1.0000000e+00, 2.4973490e+01, 1.2153372e+02, 4.1400000e+01],
       [2.0133330e+03, 9.0456060e+01, 9.0000000e+00, 2.4974330e+01, 1.2154310e+02, 5.8100000e+01],
       [2.0129170e+03, 4.9223130e+02, 5.0000000e+00, 2.4965150e+01, 1.2153737e+02, 3.9300000e+01],
       [2.0126670e+03, 2.4696450e+03, 4.0000000e+00, 2.4961080e+01, 1.2151046e+02, 2.3800000e+01],
       [2.0135000e+03, 1.1648380e+03, 4.0000000e+00, 2.4991560e+01, 1.2153406e+02, 3.4300000e+01]])
This gives us the raw numerical data. Similarly, we can access the NumPy array behind the Series representing a specific column:
df['transaction date'].values
array([2012.917, 2012.917, 2013.583, 2013.5 , 2012.833, 2012.667, 2012.667, 2013.417, 2013.5 , 2013.417, 2013.083, 2013.333, 2012.917, 2012.667, 2013.5 ])
Or row:
df.loc['A'].values
array([2012.917 , 84.87882, 10. , 24.98298, 121.54024, 37.9 ])
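As a side note, in recent versions of pandas the to_numpy() method is the recommended way of getting at the underlying array, with values retained for backwards compatibility:
df.to_numpy()                      # same array as df.values
df['transaction date'].to_numpy()  # same array as df['transaction date'].values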
Defining one-dimensional (flat) NumPy arrays¶
Defining a one-dimensional (flat) array is easy: you can simply wrap a Python list:
a = np.array([3.57, 4.18, 25.7])
a
array([ 3.57, 4.18, 25.7 ])
np.ndim(a)
1
np.size(a)
3
np.shape(a)
(3,)
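Each NumPy array also carries a dtype, which describes the type of its elements:
a.dtype   # dtype('float64'): 64-bit floats by default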
np.arange, np.linspace, np.logspace¶
Often we want to obtain a result similar to that of range but for floating point numbers (rather than integers); we want to obtain an array of floating point numbers at equally spaced intervals (e.g. for plotting). In NumPy this is attained using linspace. Thus, to obtain an array of fifteen equally spaced floating point numbers starting at -5.0 (inclusive) and finishing at 10.0 (inclusive), we can use
np.linspace(-5., 10., 15)
array([-5. , -3.92857143, -2.85714286, -1.78571429, -0.71428571, 0.35714286, 1.42857143, 2.5 , 3.57142857, 4.64285714, 5.71428571, 6.78571429, 7.85714286, 8.92857143, 10. ])
Notice that the arguments specifying the start and end of the interval are both inclusive: the first value is exactly -5.0, the last value is exactly 10.0. This is unlike the arguments of Python's standard range: the start of the range is inclusive, but the end of the range is exclusive:
list(range(-5, 10, 1))
[-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Also note that the third argument of np.linspace is the total number of points to be produced, whereas the third argument of range is the step size. NumPy has a function np.arange, which is similar to range, but works with floating point arguments and produces NumPy arrays:
np.arange(-5., 10., 1.25)
array([-5. , -3.75, -2.5 , -1.25, 0. , 1.25, 2.5 , 3.75, 5. , 6.25, 7.5 , 8.75])
np.arange(-5., 10., 1.)
array([-5., -4., -3., -2., -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
np.linspace and np.arange are particularly useful for plotting:
xs = np.linspace(0., 99. * math.pi / 8., 100)
ys = np.sin(xs)
We need the following cell magic to enable plotting in a Jupyter notebook:
%matplotlib inline
And then we plot using the Matplotlib library (specifically, its pyplot module):
import matplotlib.pyplot as plt
plt.plot(xs, ys);
Instead of linearly (equally) spaced points, we can produce points spaced logarithmically using np.logspace:
xs = np.logspace(start=0., stop=5., num=15, base=10.)
xs
array([1.00000000e+00, 2.27584593e+00, 5.17947468e+00, 1.17876863e+01, 2.68269580e+01, 6.10540230e+01, 1.38949549e+02, 3.16227766e+02, 7.19685673e+02, 1.63789371e+03, 3.72759372e+03, 8.48342898e+03, 1.93069773e+04, 4.39397056e+04, 1.00000000e+05])
ys = [x**2 for x in xs]
Here we start with the zeroth power of ten and end with the fifth power of ten.
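As a quick sanity check, np.logspace is equivalent to raising the base to linearly spaced exponents produced by np.linspace:
np.allclose(xs, 10. ** np.linspace(0., 5., 15))   # True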
plt.plot(xs, ys);
Reasons to prefer NumPy arrays over standard Python data structures, such as lists¶
Suppose that we want to compute the sine of a number of regularly spaced angles (perhaps with a view to plotting the resulting graph):
import math
xs = [n * math.pi / 8. for n in range(10000)]
ys = [math.sin(x) for x in xs]
We can time the creation of each of the lists, xs and ys, using the timeit cell magic:
%timeit -o xs = [n * math.pi / 8. for n in range(10000)]
1.57 ms ± 167 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
<TimeitResult : 1.57 ms ± 167 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)>
The creation of xs
took about 6 milliseconds.
%timeit -o ys = [math.sin(x) for x in xs]
1.66 ms ± 60.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
<TimeitResult : 1.66 ms ± 60.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)>
The creation of ys
took about 7 milliseconds.
We can replace the above Python lists with NumPy arrays and initialise them using vectorised operations (vectorised operations work on many values in one go), np.linspace and np.sin:
xs = np.linspace(0., 9999. * math.pi / 8., 10000)
ys = np.sin(xs)
Let us time these operations.
%timeit -o xs = np.linspace(0., 9999. * math.pi / 8., 10000)
29.4 µs ± 2.24 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
<TimeitResult : 29.4 µs ± 2.24 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)>
%timeit -o ys = np.sin(xs)
130 µs ± 17 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
<TimeitResult : 130 µs ± 17 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)>
Vectorised operations on NumPy arrays take tens to hundreds of microseconds here, as opposed to milliseconds for Python lists; thus we obtain an order of magnitude (or more) improvement in speed.
Performance is one reason to prefer NumPy arrays over standard Python data structures, such as lists.
Defining two-dimensional NumPy arrays (implementing matrices)¶
The real power of NumPy is in supporting two-dimensional arrays, which can implement matrices:
my_matrix = np.array([[4.28, 3.23, 5.87], [1.23, 5.32, 3.33]])
my_matrix
array([[4.28, 3.23, 5.87],
       [1.23, 5.32, 3.33]])
np.ndim(my_matrix)
2
np.size(my_matrix)
6
This particular two-dimensional NumPy array (matrix) is 2 by 3:
np.shape(my_matrix)
(2, 3)
Other ways of creating NumPy arrays¶
Instead of creating NumPy arrays from (either nested or flat) Python lists, we could use dedicated NumPy functions, such as np.zeros, np.ones, np.full, np.empty, and np.tile.