# Copyright (c) Thalesians Ltd, 2019-2023. All rights reserved.
# Copyright (c) Paul Alexander Bilokon, 2019-2023. All rights reserved.
# Author: Paul Alexander Bilokon <[email protected]>
# This version: 2.0 (2023.11.17)
# Previous versions: 1.0 (2019.09.05)
# Email: [email protected]


import pandas as pd


# Small housing sample stored column by column: each key is a column name and
# each list holds that column's 15 observations (one entry per property).
_housing_sample = {
    'transaction date': [2012.917, 2012.917, 2013.583, 2013.500, 2012.833, 2012.667, 2012.667, 2013.417, 2013.500, 2013.417, 2013.083, 2013.333, 2012.917, 2012.667, 2013.500],
    'distance to the nearest MRT station': [84.87882, 306.59470, 561.98450, 561.98450, 390.56840, 2175.03000, 623.47310, 287.60250, 5512.03800, 1783.18000, 405.21340, 90.45606, 492.23130, 2469.64500, 1164.83800],
    'number of convenience stores': [10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, 9, 5, 4, 4],
    'latitude': [24.98298, 24.98034, 24.98746, 24.98746, 24.97937, 24.96305, 24.97933, 24.98042, 24.95095, 24.96731, 24.97349, 24.97433, 24.96515, 24.96108, 24.99156],
    'longitude': [121.54024, 121.53951, 121.54391, 121.54391, 121.54245, 121.51254, 121.53642, 121.54228, 121.48458, 121.51486, 121.53372, 121.54310, 121.53737, 121.51046, 121.53406],
    'house price per unit area': [37.9, 42.2, 47.3, 54.8, 43.1, 32.1, 40.3, 46.7, 18.8, 22.1, 41.4, 58.1, 39.3, 23.8, 34.3],
}

# The frame's column order is pinned explicitly to the order in which the
# keys are written above.
df = pd.DataFrame(_housing_sample, columns=list(_housing_sample))


# IPython/Jupyter line magic (not plain Python syntax): selects matplotlib's
# inline backend so that figures are rendered in the notebook cell output.
%matplotlib inline


# Quick look at the distribution of the house prices using the histogram
# method that pandas provides on a Series. The trailing semicolon suppresses
# the textual repr of the returned object in the notebook output.
price_per_unit_area = df['house price per unit area']
price_per_unit_area.hist();


import matplotlib.pyplot as plt


# The same histogram drawn directly with matplotlib from the column's
# underlying NumPy array.
plt.hist(df['house price per unit area'].to_numpy());


import numpy as np
# Histogram of the house prices, annotated with vertical lines at the mean
# and at one standard deviation either side of it.
prices = df['house price per unit area'].values

# Summary statistics; the standard deviation is the square root of np.var
# called with its default arguments.
mean = np.mean(prices)
var = np.var(prices)
sd = np.sqrt(var)

plt.hist(prices)
plt.axvline(mean, color='r', label='mean')
# Only one of the two dashed lines is given a label, so the legend shows a
# single 'mean +/- s.d.' entry.
dashed_red = {'linestyle': '--', 'color': 'r'}
plt.axvline(mean + sd, label='mean +/- s.d.', **dashed_red)
plt.axvline(mean - sd, **dashed_red)

axes = plt.gca()
axes.set_title('house price per unit area')
axes.set_xlabel('value')
axes.set_ylabel('frequency')
axes.legend();


# Histogram of the latitudes with the mean and mean +/- one standard
# deviation marked as vertical lines.
latitudes = df['latitude'].values

mean = np.mean(latitudes)
var = np.var(latitudes)
sd = np.sqrt(var)

plt.hist(latitudes)
plt.axvline(mean, color='r', label='mean')
# The label is attached to only one dashed line so the legend has one entry
# for the pair.
dashed_red = {'linestyle': '--', 'color': 'r'}
plt.axvline(mean + sd, label='mean +/- s.d.', **dashed_red)
plt.axvline(mean - sd, **dashed_red)

axes = plt.gca()
axes.set_title('latitude')
axes.set_xlabel('value')
axes.set_ylabel('frequency')
axes.legend()
# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45);


# Scatter plot: number of convenience stores (x) against house price per
# unit area (y). The 'o' format string draws markers only, with no
# connecting line.
store_counts = df['number of convenience stores'].values
unit_prices = df['house price per unit area']
plt.plot(store_counts, unit_prices, 'o')
plt.gca().set(xlabel='number of convenience stores',
              ylabel='house price per unit area');


# Synthetic linear relationship y = 3x + 10 + noise, with standard normal x
# and standard normal noise. The x sample is drawn before the noise sample.
size = 1000
xs = np.random.normal(size=size)
noise = np.random.normal(size=size)
ys = 10. + 3. * xs + noise


# Scatter of the synthetic sample: markers only ('o'), axes labelled x and y.
plt.plot(xs, ys, 'o')
plt.gca().set(xlabel='x', ylabel='y');


# Synthetic linear relationship y = 3x + 10 + 5 * noise: the same line as a
# unit-noise sample would follow, but with the standard normal noise scaled
# by a factor of five. The x sample is drawn before the noise sample.
size = 1000
xs = np.random.normal(size=size)
noise = np.random.normal(size=size)
ys = 10. + 3. * xs + 5. * noise


# Scatter of the current synthetic sample, drawn as unconnected markers.
axes = plt.gca()
axes.plot(xs, ys, 'o')
axes.set_xlabel('x')
axes.set_ylabel('y');


# Simple "map" of the properties: latitude on the x-axis, longitude on the
# y-axis, one marker per row of the frame.
latitudes, longitudes = df['latitude'], df['longitude']
plt.plot(latitudes, longitudes, 'o')
# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.gca().set(xlabel='latitude', ylabel='longitude');


# Latitude/longitude map in which every point is annotated with its row
# index, so that individual properties can be looked up in the table.
coordinates = df[['latitude', 'longitude']]
plt.plot(coordinates['latitude'], coordinates['longitude'], 'o')
plt.xticks(rotation=45)
plt.gca().set(xlabel='latitude', ylabel='longitude');

# itertuples(name=None) yields plain (index, latitude, longitude) tuples.
for idx, x, y in coordinates.itertuples(name=None):
    plt.text(x, y, idx)

# Last expression of the cell: display the frame alongside the plot.
df


# Bubble map: each property is drawn at (latitude, longitude) with a marker
# area proportional to its house price per unit area (scaled by 5 so the
# differences are visible).
#
# All points are drawn with a single vectorised scatter call, which accepts
# an array of marker sizes, instead of one scatter call per row.
prices = df['house price per unit area'].to_numpy()
plt.scatter(df['latitude'], df['longitude'], s=5 * prices, color='blue', alpha=.5)

# Annotate every point with its row index.
for idx, x, y in zip(df.index, df['latitude'], df['longitude']):
    plt.text(x, y, idx)

plt.xticks(rotation=45)
# Fixed axis limits for the view.
plt.xlim((24.94, 25.))
plt.ylim((121.48, 121.56))
plt.xlabel('latitude')
plt.ylabel('longitude');


# Colour-coded map: each property is drawn at (latitude, longitude) and
# coloured by its house price per unit area using the 'Blues' colormap.
#
# plt.get_cmap is used rather than plt.cm.get_cmap: the latter was deprecated
# in Matplotlib 3.7 and emits a MatplotlibDeprecationWarning.
cm = plt.get_cmap('Blues')
colours = df['house price per unit area'].values
plt.scatter(df['latitude'], df['longitude'], c=colours, alpha=.5, cmap=cm)

# Annotate every point with its row index.
for idx, x, y in zip(df.index, df['latitude'], df['longitude']):
    plt.text(x, y, idx)

plt.xticks(rotation=45)
# Fixed axis limits for the view.
plt.xlim((24.94, 25.))
plt.ylim((121.48, 121.56))
plt.xlabel('latitude')
plt.ylabel('longitude');

C:\Users\paul\AppData\Local\Temp\ipykernel_468\511204621.py:1: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  cm = plt.cm.get_cmap('Blues')


# Zoomed-in colour-coded map: points are coloured by house price per unit
# area with the 'Blues' colormap, and the axis limits restrict the view to
# latitudes 24.96-25.0 and longitudes 121.52-121.56.
#
# plt.get_cmap is used rather than plt.cm.get_cmap: the latter was deprecated
# in Matplotlib 3.7 and emits a MatplotlibDeprecationWarning.
cm = plt.get_cmap('Blues')
colours = df['house price per unit area'].values
plt.scatter(df['latitude'], df['longitude'], c=colours, alpha=.5, cmap=cm)
plt.xticks(rotation=45)
plt.xlim((24.96, 25.))
plt.ylim((121.52, 121.56))
plt.xlabel('latitude')
plt.ylabel('longitude');

C:\Users\paul\AppData\Local\Temp\ipykernel_468\817892768.py:1: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  cm = plt.cm.get_cmap('Blues')


# House price per unit area against transaction date, drawn as unconnected
# markers ('o').
transaction_dates = df['transaction date']
unit_prices = df['house price per unit area']
plt.plot(transaction_dates, unit_prices, 'o');


# Sample z = x^2 + y^2 on the integer lattice -5..5 x -5..5. Each entry is an
# (x, y, z) triple; x varies slowest and y fastest.
lattice_axis = range(-5, 6)
grid = [(x, y, x ** 2 + y ** 2) for x in lattice_axis for y in lattice_axis]


import seaborn as sns
# Arrange the (x, y, value) grid points into an 11 x 11 matrix for the
# heatmap. seaborn.heatmap draws the matrix rows along the y-axis and the
# matrix columns along the x-axis, so the point (x, y) goes into row y + 5
# and column x + 5 (the +5 shifts the coordinate range -5..5 onto the array
# indices 0..10). Indexing the other way round would swap the axes relative
# to their tick labels and axis titles.
data = np.empty((11, 11))
for x, y, value in grid:
    data[y + 5, x + 5] = value

# Tick labels are the distinct x and y coordinates in ascending order, which
# matches the column and row order used above.
ax = sns.heatmap(data,
                 xticklabels=sorted({x for x, _, _ in grid}),
                 yticklabels=sorted({y for _, y, _ in grid}))
ax.set(xlabel='x', ylabel='y');

Without solutions

Visualisation

Motivation

Objectives

Histogram

Exercise

Rotating the tick labels

Exercise

Scatter plot

Exercise

Maps

Labelling individual points

Distinguishing points using different sizes

Distinguishing points using different colours

Zooming in and out

Time series plot

Heatmaps

Further reading

	transaction date	distance to the nearest MRT station	number of convenience stores	latitude	longitude	house price per unit area
0	2012.917	84.87882	10	24.98298	121.54024	37.9
1	2012.917	306.59470	9	24.98034	121.53951	42.2
2	2013.583	561.98450	5	24.98746	121.54391	47.3
3	2013.500	561.98450	5	24.98746	121.54391	54.8
4	2012.833	390.56840	5	24.97937	121.54245	43.1
5	2012.667	2175.03000	3	24.96305	121.51254	32.1
6	2012.667	623.47310	7	24.97933	121.53642	40.3
7	2013.417	287.60250	6	24.98042	121.54228	46.7
8	2013.500	5512.03800	1	24.95095	121.48458	18.8
9	2013.417	1783.18000	3	24.96731	121.51486	22.1
10	2013.083	405.21340	1	24.97349	121.53372	41.4
11	2013.333	90.45606	9	24.97433	121.54310	58.1
12	2012.917	492.23130	5	24.96515	121.53737	39.3
13	2012.667	2469.64500	4	24.96108	121.51046	23.8
14	2013.500	1164.83800	4	24.99156	121.53406	34.3