Lab4 IO

# Read every district name, sort alphabetically and print the first 18.
# `with` closes the file even if reading raises.
with open('./districts.txt', 'r') as file:
    districts = sorted(file)
# Each line keeps its trailing '\n', so print() leaves a blank line after it.
# Slicing instead of indexing 0..17 avoids IndexError on a shorter file.
for district in districts[:18]:
    print(district)

Central and Western

Eastern

Islands

Kowloon City

Kwai Tsing

Kwun Tong

North

Sai Kung

Sha Tin

Sham Shui Po

Southern

Tai Po

Tsuen Wan

Tuen Mun

Wan Chai

Wong Tai Sin

Yau Tsim Mong

Yuen Long

import os

# Collect every line containing '@' from all files in ./data, then sort them.
path = './data'
emails = []
for filename in os.listdir(path):
    # os.path.join builds a portable path; `with` guarantees the file is
    # closed and keeps the file name separate from the file handle.
    with open(os.path.join(path, filename), 'r') as file:
        for line in file:
            if '@' in line:
                emails.append(line.rstrip('\n'))
emails.sort()
print(emails)

['choi@comp.hkbu.edu.hk', 'chxw@comp.hkbu.edu.hk', 'jiming@comp.hkbu.edu.hk', 'jng@comp.hkbu.edu.hk', 'pcyuen@comp.hkbu.edu.hk', 'william@comp.hkbu.edu.hk', 'xujl@comp.hkbu.edu.hk', 'yikeguo@hkbu.edu.hk', 'ymc@comp.hkbu.edu.hk', 'ywleung@comp.hkbu.edu.hk']

# Create (or overwrite) week4.txt with three lines.
initial_lines = (
    "This is the first line\n",
    "This is the second line\n",
    "The end\n",
)
with open('week4.txt','w') as file:
    for text in initial_lines:
        file.write(text)

# Append new content to the existing file.
with open('week4.txt','a') as file:
    file.write("Extra line added\n")

Q4 Get current directory

import os

# In a notebook the value of the last expression is echoed as the cell output.
os.getcwd()

1	'C:\\Users\\f2401539\\Desktop'

# Locate districts.txt through the current working directory, then sort the
# lines and print the first 18.
# os.path.join avoids the invalid '\d' escape and the Windows-only separator.
with open(os.path.join(os.getcwd(), 'districts.txt'), 'r') as file:
    districts = sorted(file)
# Slicing avoids IndexError when the file holds fewer than 18 lines.
for district in districts[:18]:
    print(district)

Central and Western

Eastern

Islands

Kowloon City

Kwai Tsing

Kwun Tong

North

Sai Kung

Sha Tin

Sham Shui Po

Southern

Tai Po

Tsuen Wan

Tuen Mun

Wan Chai

Wong Tai Sin

Yau Tsim Mong

Yuen Long

Q6 write and read csv

import csv

courses = [['Course Code', 'Year', 'Semester','Course Name'],
           ['COMP7035', '2022-23', 'Sem A', 'Python for Data Analytics and Artificial Intelligence'],
           ['COMP1007', '2021-22', 'Sem B','Introduction to Python and Its Applications']]
# newline='' lets the csv module control line endings itself. Without it,
# Windows text mode turns the writer's '\r\n' into '\r\r\n', which reads back
# as an empty row after every record.
with open('courses.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(courses)

import csv

# Print every row of courses.csv as a list of strings.
# newline='' is the mode the csv module expects for files it parses.
with open('courses.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

['Course Code', 'Year', 'Semester', 'Course Name']
[]
['COMP7035', '2022-23', 'Sem A', 'Python for Data Analytics and Artificial Intelligence']
[]
['COMP1007', '2021-22', 'Sem B', 'Introduction to Python and Its Applications']
[]

Lab5 Numpy

create an array of the integers from 20 to 50

1
2
3

import numpy as np

# arange excludes its stop value, hence `high + 1` to include 50.
low, high = 20, 50
array = np.arange(low, high + 1)
print(array)

1	[20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50]

create an array of the integers from 0 to 50, evenly spaced by 10

1
2
3

import numpy as np

# Six evenly spaced points from 0 to 50 inclusive, i.e. a spacing of 10.
# linspace returns floats; np.arange(0, 51, 10) would give integers instead.
array = np.linspace(0, 50, 6)
print(array)

1	[ 0. 10. 20. 30. 40. 50.]

show different properties of the numpy array

import numpy as np

array = np.arange(20)
print(array)

# Rearrange the 20 values into 4 rows x 5 columns, then show the array's
# type followed by its descriptive attributes.
array = array.reshape(4, 5)
print(array)
print(type(array))
for attribute in ('ndim', 'shape', 'dtype', 'itemsize', 'size'):
    print(getattr(array, attribute))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
 
<class 'numpy.ndarray'>

2

(4, 5)

int32

4

20

create an array of the integers from 9 to 31 and print all values except the first and the last

1
2
3

import numpy as np

# Integers 9..31 (the stop value 32 is exclusive).
array = np.arange(9, 32)
# [1:-1] drops the first and the last element.
print(array[1:-1])

1	[10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30]

create an array of 5 zeros, 5 ones, 5 fives

import numpy as np

print('An array of 5 zeros:')
array = np.zeros(5)
print(array)

print('An array of 5 ones:')
array = np.ones(5)
print(array)

# Scaling an array of ones gives an array filled with fives.
print('An array of 5 fives:')
array = np.ones(5) * 5
print(array)

An array of 5 zeros:
[0. 0. 0. 0. 0.]
An array of 5 ones:
[1. 1. 1. 1. 1.]
An array of 5 fives:
[5. 5. 5. 5. 5.]

create a 5x5 zero matrix with the diagonal elements set to 5,4,3,2,1

1
2
3

import numpy as np

# Descending values 5..1 placed on the main diagonal of a zero matrix.
diagonal_values = np.arange(5, 0, -1)
array = np.diag(diagonal_values)
print(array)

[[5 0 0 0 0]
 [0 4 0 0 0]
 [0 0 3 0 0]
 [0 0 0 2 0]
 [0 0 0 0 1]]

find missing item in a given array

import numpy as np

nan = np.nan
array = np.array([
    [1, 1, nan, 1],
    [nan, 1, 1, 1],
    [1, nan, 1, 1],
])
print('\nFind the missing data of the said array:')
# Boolean mask: True wherever the entry is NaN.
missing_mask = np.isnan(array)
print(missing_mask)

Find the missing data of the said array:
[[False False  True False]
 [ True False False False]
 [False  True False False]]

indexing row and col

import numpy as np

array = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
array[1]        # second row
array[1, 0]     # first element of the second row
print(array)
print('-' * 4)
# First two rows, columns from index 1 onwards.
print(array[0:2, 1:])

array([20, 25, 30])
20
[[ 5 10 15]
 [20 25 30]
 [35 40 45]]
----
[[10 15]
 [25 30]]

Lab6 Matplotlib

Simple plots

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Minimal line plot of y against x.
x = [1,2,3]
y = [50,100,150]
plt.plot(x,y)

Add the title and x, y label

# Line plot with a title and axis labels.
x = [1, 2, 3]
y = [50, 100, 150]

plt.plot(x, y)
plt.title("Title")
plt.xlabel("Label X")
plt.ylabel("Label Y")
# Render the figure.
plt.show()

fit the line: xlim(), ylim()

# Line plot with explicit axis limits set before plotting.
x = [1, 2, 3]
y = [50, 100, 150]

plt.xlim(1,3)
# NOTE(review): the upper y limit (140) is below max(y) = 150, so the last
# point presumably falls outside the visible range - confirm this is intended.
plt.ylim(0,140)
plt.plot(x, y)
plt.title("Title")
plt.xlabel("Label X")
plt.ylabel("Label Y")
plt.show()

customize

# Customised line: green, dash-dot style, large '>' markers.
# x and y are not defined in this snippet; they must already exist.
plt.plot(x, y, color="green", marker='>', markersize=20, linestyle='dashdot')

plt.xlim(1, 3)
plt.ylim(0, 150)

plt.title('Title')
plt.xlabel('Label X')
# The trailing ';' is presumably there to suppress the notebook's echo of the
# returned text object.
plt.ylabel('Label Y');

example

# Plot the '65 years old and above' column of elderly.csv against the year.
# read_csv lives in the pandas namespace (pd).
df = pd.read_csv('elderly.csv')
year = df['Year'].values.tolist()
print(year)
# NOTE(review): the recorded values are strings such as '1,022', so the y axis
# is presumably treated as categorical; passing thousands=',' to read_csv
# would likely parse them as numbers - confirm against the CSV.
sixtyFiveAbove = df['65 years old and above'].values.tolist()
print(sixtyFiveAbove)

plt.plot(year, sixtyFiveAbove, color="green", marker='>', markersize=20, linestyle='dashdot')
plt.title('Title')
plt.xlabel('Label X')
plt.ylabel('Label Y');

1 2	[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020] ['836', '859', '892', '931', '975', '1,022', '1,067', '1,116', '1,164', '1,222', '1,297']

Example2

import random

n1 = 100
n2 = 1000
n3 = 100000
def rolls_plt(arg):
    """Simulate `arg` die rolls and show a bar chart of each face's relative frequency."""
    faces = list(range(1, 7))
    counts = [0 for _ in faces]

    for _ in range(arg):
        # randint is inclusive on both ends: 0..5 indexes the six faces.
        counts[random.randint(0, 5)] += 1

    frequencies = [count / arg for count in counts]
    plt.bar(faces, frequencies, color='royalblue')
    plt.show()

rolls_plt(n1)
#rolls_plt(n2)
#rolls_plt(n3)

Example3

$x = \cos(\theta)$

$y = \sin(\theta) + \cos(\theta)^{2} / 3$

import numpy as np
from matplotlib import pyplot as plt
import math

# Sample the parameter over one full turn and trace the parametric curve.
theta = np.linspace(0, 2 * np.pi, num=100)
cos_theta = np.cos(theta)
x = cos_theta
# `**` binds tighter than `/`: the second term is cos(theta) squared, divided by 3.
y = np.sin(theta) + (cos_theta ** 2) / 3

plt.plot(x, y)
plt.show()

Example4

import matplotlib.pyplot as plt
from matplotlib import animation
import numpy as np
from IPython.display import display, clear_output

fig = plt.figure()  # Create a canvas to be painted

# Define theta here so the snippet does not rely on an earlier cell.
theta = np.linspace(0, 2 * np.pi, 100)
x = np.cos(theta)
y = np.sin(theta) + np.cos(theta) ** 2/3

ax = fig.subplots()
# ax.plot returns a list of lines; keep the single line that was drawn.
l = ax.plot(x, y)[0]


def animate(i):
    """Show only the first `i` points of the curve and return the updated line."""
    l.set_data(x[:i], y[:i])
    # Return the line object (letter l), not the number 1.
    return l,


# Redraw the figure once per frame, replacing the previous notebook output.
for i in range(len(x)):
    animate(i)
    clear_output(wait=True)
    display(fig)
plt.show()

Lab7 Seaborn/class

Write a class for Person

Basic Properties: Age, Name, Sex.

Extension Properties: Working, Sleepings, just consider the hours they work and sleep everyday.

Then, instantiate the class as two different persons

class Person:
    """A person with a name, age and sex who can report daily working and sleeping hours."""

    def __init__(self, name, age, sex):
        """Store the basic properties of the person."""
        self.name = name
        self.age = age
        self.sex = sex

    def Working(self, hours):
        """Print how many hours this person works every day."""
        # Spaces around the verb keep the words from running together.
        print(self.name + ' working ' + str(hours) + ' hours everyday')

    def Sleep(self, hours):
        """Print how many hours this person sleeps every day."""
        print(self.name + ' sleeping ' + str(hours) + ' hours everyday')
        
# Two separate instances of Person, each with its own attribute values.
object1 = Person('jack',18,'male')
object2 = Person('mark',19,'male')
for person in (object1, object2):
    print(person.name, person.age, person.sex)
object1.Working(4)
object2.Sleep(4)

jack 18 male
mark 19 male
jack working 4 hours everyday
mark sleeping 4 hours everyday

Create a matrix with the following style(pad)

1
2
3

import numpy as np

a = [[1,2],[3,4]]
# Zero padding: 1 row above, 9 rows below, 3 columns on each side.
a_pad = np.pad(a, ((1, 9), (3, 3)), 'constant')
print(a_pad)

[[0 0 0 0 0 0 0 0]
 [0 0 0 1 2 0 0 0]
 [0 0 0 3 4 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]

Create a matrix with the following style

import numpy as np

# Diagonal values 1.0 .. 4.0; arange excludes its stop value, hence 5.
a = np.arange(1, 5, dtype=float)
a_d = np.diag(a)
print(a_d)

[[1. 0. 0. 0.]
 [0. 2. 0. 0.]
 [0. 0. 3. 0.]
 [0. 0. 0. 4.]]

Seaborn->pass

Lab8 Pandas_1

Exam1

1. Create a 5-D random numpy list var_list

import numpy as np

# Five samples drawn from the standard normal distribution.
var_list = np.random.randn(5)

2. use “uuid” to generate 5 random keys (use str(uuid.uuid4())), and store them into a list key_list

import uuid

# Five random keys, each the first 6 characters of a UUID4 string.
key_list = [str(uuid.uuid4())[:6] for i in range(5)]

3. Create a dictionary dict from var_list and key_list

# Pair each key with the value at the same position.
dict_tmp = dict(zip(key_list, var_list))

4. Create a Pandas Series from a) var_list b) var_list and key_list c) dict

import pandas as pd

# a) values only, b) values with explicit index labels, c) from a dictionary.
pd_series_var = pd.Series(var_list)
pd_series_var_key = pd.Series(var_list, index=key_list)
pd_series_dict = pd.Series(dict_tmp)

5. Convert the Series back to the list and dictionary

# Convert the Series back to plain Python containers.
var_list_new = pd_series_var_key.to_list()
dict_new = pd_series_dict.to_dict()

6. Find out the elements larger than zero

# Boolean-mask selection: keep only the elements greater than zero.
pd_series_positive = pd_series_dict[pd_series_dict > 0]

7. Calculate the proportion of positive elements in the Series

# Share of the elements that are positive.
proportion = len(pd_series_positive) / len(pd_series_dict)

8. Write down as many ways of forming a list that contains the values of Series elements

# Way 1: direct conversion.
val_1 = pd_series_dict.to_list()

# Way 2: iterate over (index, value) pairs. Series.iteritems() was removed in
# pandas 2.0; items() is the replacement.
val_2 = []
for idx, ival in pd_series_dict.items():
    val_2.append(ival)

# Way 3: the underlying values (a NumPy array rather than a list).
val_3 = pd_series_dict.values

# Way 4: look every value up by its index label.
val_tmp = pd_series_dict.index
val_4 = [pd_series_dict[ikey] for ikey in pd_series_dict.index]

9. Calculate the proportion of elements that are larger than the mean value of the Series

1
2
3

# Share of the elements that exceed the mean of the Series.
mean_val = np.mean(pd_series_dict)
above_mean_mask = pd_series_dict > mean_val
pd_series_larger_than_mean = pd_series_dict[above_mean_mask]
proportion_2 = len(pd_series_larger_than_mean) / len(pd_series_dict)

Exam2

1. Write codes to create a random dict x which has 5 random keys and each key corresponds to a 6-D numpy array

import uuid

# Five random 6-character keys, each mapped to a 6-element array of
# standard-normal samples.
key_list = [str(uuid.uuid4())[:6] for _ in range(5)]
data = {rand_key: np.random.randn(6) for rand_key in key_list}

2. Create a pandas dataframe using x

# Each dictionary key becomes a column of the DataFrame.
df = pd.DataFrame(data)
print(df)

3. Create a pandas dataframe using a subset of x, in the subset of x, only keys that start with a digit are chosen

# Keep only the keys whose first character is a decimal digit.
digit_chars = '0123456789'
sub_key_list = []
for i_key in key_list:
    if i_key[0] in digit_chars:
        sub_key_list.append(i_key)
print(sub_key_list)
# `columns` restricts the DataFrame to the selected keys.
df_sub = pd.DataFrame(data, columns=sub_key_list)
print(df_sub)

4. Create a new pandas dataframe using the codes in the previous slide

1
2
3

# Eight consecutive daily timestamps starting on 2000-01-01 index an
# 8 x 4 frame of standard-normal samples.
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

5. Select rows whose attribute A is smaller than the mean of attribute C

# Mean of column C, then keep only the rows whose A value lies below it.
df_c = df['C']
mean_c = np.mean(df_c)
print(df_c)
print(mean_c)
rows_below_mean = df['A'] < mean_c
print(df[rows_below_mean])

6. Can you select the column B and C using [] indexing? Try it out and see what happens

# A list of labels inside [] selects several columns at once.
df_bc = df[['B', 'C']]
print(df_bc)

Exam3

convert a Panda module Series to Python list

import pandas as pd

series = pd.Series([1, 2, 3, 4, 5])
print("Pandas Series and type")
print(series)
print(type(series))

# tolist() produces a plain Python list of the Series values.
print("Convert Pandas Series to Python list")
as_list = series.tolist()
print(as_list)
print(type(as_list))

Pandas Series and type
0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>
Convert Pandas Series to Python list
[1, 2, 3, 4, 5]
<class 'list'>

convert a dictionary to a Pandas series

import pandas as pd

# Named `source_dict` so that the builtin `dict` is not shadowed.
source_dict = {'a': 100, 'b': 200, 'c':300, 'd':400, 'e':500}
print("Original dictionary:")
print(source_dict)

# Dictionary keys become the index labels of the Series.
new_series = pd.Series(source_dict)
print("Converted series:")
print(new_series)

Original dictionary:
{'a': 100, 'b': 200, 'c': 300, 'd': 400, 'e': 500}
Converted series:
a    100
b    200
c    300
d    400
e    500
dtype: int64

convert a NumPy array to a Pandas series

import numpy as np
import pandas as pd

# Integers 1..5 (arange excludes its stop value).
np_array = np.arange(1, 6)
print("NumPy array:")
print(np_array)

# Without an explicit index the Series gets the default 0..n-1 labels.
new_series = pd.Series(np_array)
print("Converted Pandas series:")
print(new_series)

NumPy array:
[1 2 3 4 5]
Converted Pandas series:
0    1
1    2
2    3
3    4
4    5
dtype: int32

convert the column of a DataFrame as a Series

import pandas as pd

d = {
    'col1': [1, 2, 3, 4, 7, 11],
    'col2': [4, 5, 6, 9, 5, 0],
    'col3': [7, 5, 8, 12, 1, 11],
}
df = pd.DataFrame(d)
print(type(df))

print("Original DataFrame")
print(df)

# All rows of the column at position 0, returned as a Series.
first_column_position = 0
s1 = df.iloc[:, first_column_position]

print("\n1st column as a Series:")
print(s1)
print(type(s1))

<class 'pandas.core.frame.DataFrame'>
Original DataFrame
   col1  col2  col3
0     1     4     7
1     2     5     5
2     3     6     8
3     4     9    12
4     7     5     1
5    11     0    11

1st column as a Series:
0     1
1     2
2     3
3     4
4     7
5    11
Name: col1, dtype: int64
<class 'pandas.core.series.Series'>

Create a subset of a series

import pandas as pd

s = pd.Series([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15])
print("Original Data Series:")
print(s)

# Boolean-mask selection: keep only the elements greater than n.
print("\nSubset of the above Data Series:")
n = 10
new_s = s[s > n]
print(new_s)

Original Data Series:
0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
dtype: int64

Subset of the above Data Series:
11    11
12    12
13    13
14    14
15    15
dtype: int64

Lab9 Pandas_2

import numpy as np
import pandas as pd

1. Write code to create two DataFrames, df_left and df_right, with the columns “[key, lval1, lval2]” and “[key, rval1, rval2]”, where the key values are “[a,b,c]” and “[b,c,d]” respectively. Generate normally distributed random numbers for the “lval” and “rval” elements

split_str = '_______'
# Both frames share the 'key' column; the value columns hold
# standard-normal samples.
left_df = pd.DataFrame({'key':['a','b','c'],'lval1':np.random.randn(3),'lval2':np.random.randn(3)})
right_df = pd.DataFrame({'key':list('bcd'),'rval1':np.random.randn(3),'rval2':np.random.randn(3)})
print(left_df)
print(split_str)
print(right_df)

  key     lval1     lval2
0   a -0.306740  0.370246
1   b -1.633727 -0.351369
2   c  1.558975 -0.179692
---------
  key     rval1     rval2
0   b -0.036699  0.724182
1   c -1.241680 -1.695795
2   d  1.580775 -1.271330

2. Compute the left outer join of df_left and df_right, check out the results

# Left outer join: keep every row of left_df; rows without a match in
# right_df get NaN in the right-hand columns.
left_merge = pd.merge(left_df, right_df, how='left')
print(left_merge)

  key     lval1     lval2     rval1     rval2
0   a -0.306740  0.370246       NaN       NaN
1   b -1.633727 -0.351369 -0.036699  0.724182
2   c  1.558975 -0.179692 -1.241680 -1.695795

3. Change the name “key” of df_left to “key_left”, re-run step 2 and see what happens

# NOTE(review): after this rename the two frames share no column name, so
# pd.merge without on/left_on/right_on presumably fails. The recorded outputs
# of the later steps still show a 'key' column, so the original name was
# likely restored before running them - confirm.
left_df.columns = ['key_left', 'lval1', 'lval2']

4. Compute the right outer join of df_left and df_right in step 2, check out the results

# Right outer join: keep every row of right_df.
right_merge = pd.merge(left_df, right_df, how='right')
print(right_merge)

  key     lval1     lval2     rval1     rval2
0   b -1.633727 -0.351369 -0.036699  0.724182
1   c  1.558975 -0.179692 -1.241680 -1.695795
2   d       NaN       NaN  1.580775 -1.271330

5. Compute the full outer join of df_left and df_right in step 2, check out the results

# Full outer join: keep the rows of both frames.
outer_merge = pd.merge(left_df, right_df, how='outer')
print(outer_merge)

  key     lval1     lval2     rval1     rval2
0   a -0.306740  0.370246       NaN       NaN
1   b -1.633727 -0.351369 -0.036699  0.724182
2   c  1.558975 -0.179692 -1.241680 -1.695795
3   d       NaN       NaN  1.580775 -1.271330

6. Compute the inner join of df_left and df_right in step 2, check out the results

# Inner join: keep only the keys present in both frames.
inner_merge = pd.merge(left_df, right_df, how='inner')
print(inner_merge)

1
2
3

  key     lval1     lval2     rval1     rval2
0   b -1.633727 -0.351369 -0.036699  0.724182
1   c  1.558975 -0.179692 -1.241680 -1.695795

7. Get the floating value columns of df_left (lval1,lval2), get the square root of the absolute values using apply

value_columns = ['lval1', 'lval2']
print(left_df)
left_df_val = left_df[value_columns]
print(left_df_val)

# Two apply passes: absolute value first, then square root.
left_df_val_abs = left_df_val.apply(np.abs)
print(left_df_val_abs)

left_df_val_abs_sqrt = left_df_val_abs.apply(np.sqrt)
print(left_df_val_abs_sqrt)

  key     lval1     lval2
0   a -0.306740  0.370246
1   b -1.633727 -0.351369
2   c  1.558975 -0.179692
      lval1     lval2
0 -0.306740  0.370246
1 -1.633727 -0.351369
2  1.558975 -0.179692
      lval1     lval2
0  0.306740  0.370246
1  1.633727  0.351369
2  1.558975  0.179692
      lval1     lval2
0  0.553841  0.608478
1  1.278173  0.592764
2  1.248589  0.423901

8. Try using numpy to directly calculate the above operations on df_left

# NumPy functions can be called on the DataFrame directly.
left_df_val_abs_sqrt_np = np.sqrt(np.abs(left_df_val))
print(left_df_val_abs_sqrt_np)

      lval1     lval2
0  0.553841  0.608478
1  1.278173  0.592764
2  1.248589  0.423901

9. Write the apply_map functions to accomplish step 7

# Same computation as the apply version, written with applymap: absolute
# value first, then square root.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map - confirm the pandas version in use before switching.
print(left_df)
left_df_val = left_df[['lval1','lval2']]
print(left_df_val)
left_df_val_abs = left_df_val.applymap(np.abs)
print(left_df_val_abs)
left_df_val_abs_sqrt = left_df_val_abs.applymap(np.sqrt)
print(left_df_val_abs_sqrt)

  key     lval1     lval2
0   a -0.306740  0.370246
1   b -1.633727 -0.351369
2   c  1.558975 -0.179692
      lval1     lval2
0 -0.306740  0.370246
1 -1.633727 -0.351369
2  1.558975 -0.179692
      lval1     lval2
0  0.306740  0.370246
1  1.633727  0.351369
2  1.558975  0.179692
      lval1     lval2
0  0.553841  0.608478
1  1.278173  0.592764
2  1.248589  0.423901

10. Get the data of “Countries and dependencies by area” from wiki and save to the excel excluding index

import requests
from io import StringIO

url_wiki = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area'
# Browser-like User-Agent, with the spaces between its tokens restored.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# requests has no default timeout, so set one to avoid hanging forever.
r = requests.get(url_wiki, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()

# Newer pandas versions deprecate passing literal HTML to read_html;
# wrapping the text in StringIO works on old and new versions alike.
data = pd.read_html(StringIO(r.text))
print([idata.shape for idata in data])
# NOTE(review): index 1 is assumed to be the area table - check the printed
# shapes if the page layout changes.
data_area = data[1]
print(data_area)
# index=False leaves the DataFrame index out of the spreadsheet.
data_area.to_excel('area_info.xlsx', index=False)

倬倬吃三碗

Lab_notes_2