0%

Lab_notes_2

Lab4 IO

Q1

1
2
3
4
5
6
# Read the district names, sort them alphabetically and print the first 18.
# The with-block closes the file even if reading fails.
with open('./districts.txt', 'r') as file:
    districts = sorted(file)  # every entry keeps its trailing newline

# Slicing instead of indexing 0..17 avoids an IndexError when the file
# holds fewer than 18 lines.
for district in districts[:18]:
    print(district)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
Central and Western

Eastern

Islands

Kowloon City

Kwai Tsing

Kwun Tong

North

Sai Kung

Sha Tin

Sham Shui Po

Southern

Tai Po

Tsuen Wan

Tuen Mun

Wan Chai

Wong Tai Sin

Yau Tsim Mong

Yuen Long

Q2

1
2
3
4
5
6
7
8
9
10
11
import os

path = './data'
emails = []
# Collect every line containing an '@' from each file in the data folder.
for name in os.listdir(path):
    # os.path.join builds a portable path (the original glued in a literal
    # '//'), and the with-block closes each file even if reading fails.
    with open(os.path.join(path, name), 'r') as file:
        for line in file:
            if '@' in line:
                emails.append(line.rstrip('\n'))
emails.sort()
print(emails)
1
['choi@comp.hkbu.edu.hk', 'chxw@comp.hkbu.edu.hk', 'jiming@comp.hkbu.edu.hk', 'jng@comp.hkbu.edu.hk', 'pcyuen@comp.hkbu.edu.hk', 'william@comp.hkbu.edu.hk', 'xujl@comp.hkbu.edu.hk', 'yikeguo@hkbu.edu.hk', 'ymc@comp.hkbu.edu.hk', 'ywleung@comp.hkbu.edu.hk']

Q3

1
2
3
4
5
6
7
8
# Mode 'w' creates the file (or truncates an existing one) before writing.
with open('week4.txt', 'w') as file:
    file.write("This is the first line\n")
    file.write("This is the second line\n")
    file.write("The end\n")

# Mode 'a' appends new content to the existing file instead of replacing it.
with open('week4.txt', 'a') as file:
    file.write("Extra line added\n")

Q4 Get current directory

1
2
import os
# Query the current working directory; the returned path is not stored in a variable here.
os.getcwd()
1
'C:\\Users\\f2401539\\Desktop'

Q5

1
2
3
4
5
6
import os

# Build the path with os.path.join: the original '\districts.txt' relied on
# the invalid escape sequence '\d' and on the Windows path separator.
with open(os.path.join(os.getcwd(), 'districts.txt'), 'r') as file:
    districts = sorted(file)  # every entry keeps its trailing newline

# Slicing avoids an IndexError when the file holds fewer than 18 lines.
for district in districts[:18]:
    print(district)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
Central and Western

Eastern

Islands

Kowloon City

Kwai Tsing

Kwun Tong

North

Sai Kung

Sha Tin

Sham Shui Po

Southern

Tai Po

Tsuen Wan

Tuen Mun

Wan Chai

Wong Tai Sin

Yau Tsim Mong

Yuen Long

Q6 write and read csv

1
2
3
4
5
6
7
8
9
10
import csv

courses = [
    ['Course Code', 'Year', 'Semester', 'Course Name'],
    ['COMP7035', '2022-23', 'Sem A', 'Python for Data Analytics and Artificial Intelligence'],
    ['COMP1007', '2021-22', 'Sem B', 'Introduction to Python and Its Applications'],
]

# newline='' lets the csv module control the line endings itself; without it
# Windows translates the writer's '\r\n' again and every record is followed by
# an empty row when the file is read back.
with open('courses.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(courses)
1
2
3
4
5
6
7
8
import csv

# Read courses.csv back and print each record as a list of strings.
# newline='' is the documented way to open files for the csv module.
# The stray 'for row in reader(f)' line was removed: it was a syntax error
# and csv reader objects are not callable.
with open('courses.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
1
2
3
4
5
6
['Course Code', 'Year', 'Semester', 'Course Name']
[]
['COMP7035', '2022-23', 'Sem A', 'Python for Data Analytics and Artificial Intelligence']
[]
['COMP1007', '2021-22', 'Sem B', 'Introduction to Python and Its Applications']
[]

Lab5 Numpy

create an array of the integers from 20 to 50

1
2
3
import numpy as np

# np.arange excludes its stop value, so the upper bound is passed as 50 + 1.
first, last = 20, 50
array = np.arange(first, last + 1)
print(array)
1
[20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50]

Create an array of the numbers from 0 to 50 with an even spacing of 10

1
2
3
import numpy as np

# Six evenly spaced samples from 0 to 50 inclusive, i.e. a spacing of 10.
# linspace returns floats.
array = np.linspace(0, 50, 6)
print(array)  # the recorded output needs this print; the cell never displayed the array
1
[ 0. 10. 20. 30. 40. 50.]

show different properties of the numpy array

1
2
3
4
5
6
7
8
9
10
11
12
13
import numpy as np

# Start from the flat sequence 0..19.
array = np.arange(20)
print(array)

# Reshape into 4 rows x 5 columns, then report the array's basic properties:
# Python type, number of dimensions, shape, element type, bytes per element
# and total element count.
array = array.reshape(4, 5)
print(array)
for prop in (type(array), array.ndim, array.shape,
             array.dtype, array.itemsize, array.size):
    print(prop)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

[[ 0 1 2 3 4]
[ 5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]]

<class 'numpy.ndarray'>

2

(4, 5)

int32

4

20

Create an array of the integers from 9 to 31 and print all values except the first and the last

1
2
3
import numpy as np

# Integers 9..31; arange must be qualified with np (the bare name raised NameError).
array = np.arange(9, 32)
# The slice 1:-1 drops the first and the last element.
print(array[1:-1])
1
[10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30]

create an array of 5 zeros, 5 ones, 5 fives

1
2
3
4
5
6
7
8
9
10
11
12
import numpy as np

print('An array of 5 zeros:')
array = np.zeros(5)
print(array)

print('An array of 5 ones:')
array = np.ones(5)
print(array)  # was print(array1): undefined name

print('An array of 5 fives:')
array = np.ones(5) * 5
print(array)  # was print(2): printed the literal 2 instead of the array
1
2
3
4
5
6
An array of 5 zeros:
[0. 0. 0. 0. 0.]
An array of 5 ones:
[1. 1. 1. 1. 1.]
An array of 5 fives:
[5. 5. 5. 5. 5.]

create 5x5 zero matrix with elements with the diagonal to 5,4,3,2,1

1
2
3
import numpy as np

# Place the values 5, 4, 3, 2, 1 on the main diagonal of a 5x5 zero matrix.
diagonal_values = np.arange(5, 0, -1)
array = np.diag(diagonal_values)
print(array)
1
2
3
4
5
[[5 0 0 0 0]
[0 4 0 0 0]
[0 0 3 0 0]
[0 0 0 2 0]
[0 0 0 0 1]]

find missing item in a given array

1
2
3
4
5
6
7
import numpy as np

# NaN marks the missing entries of the 3x4 grid.
missing = np.nan
array = np.array([
    [1, 1, missing, 1],
    [missing, 1, 1, 1],
    [1, missing, 1, 1],
])
print('\nFind the missing data of the said array:')
# np.isnan yields a boolean array of the same shape: True where a value is NaN.
nan_mask = np.isnan(array)
print(nan_mask)
1
2
3
4
Find the missing data of the said array:
[[False False True False]
[ True False False False]
[False True False False]]

indexing row and col

1
2
3
4
5
6
7
import numpy as np

array = np.array([[5, 10, 15],
                  [20, 25, 30],
                  [35, 40, 45]])
# Bare expressions: second row, then the first element of that row.
array[1]
array[1][0]
print(array)
print('----')
# Rows 0-1 combined with columns 1-2.
top_right = array[:2, 1:]
print(top_right)
1
2
3
4
5
6
7
8
array([20, 25, 30])
20
[[ 5 10 15]
[20 25 30]
[35 40 45]]
----
[[10 15]
[25 30]]

Lab6 Matplotlib

Simple plots

1
2
3
4
5
6
7
import matplotlib.pyplot as plt  # was 'matplotlib.pylot', which does not exist
import numpy as np
import pandas as pd

# Plot three points joined by a line.
x = [1, 2, 3]
y = [50, 100, 150]
plt.plot(x, y)

Add the title and x, y label

1
2
3
4
5
6
7
8
x, y = [1, 2, 3], [50, 100, 150]

# Draw the line, then annotate the current axes before showing the figure.
plt.plot(x, y)
plt.xlabel("Label X")
plt.ylabel("Label Y")
plt.title("Title")
plt.show()

Fit the line: xlim(), ylim()

1
2
3
4
5
6
7
8
9
10
x = [1, 2, 3]
y = [50, 100, 150]

# Fit the axes to the data. The upper y limit is taken from the data itself;
# the hard-coded 140 cut off the last point (150).
plt.xlim(min(x), max(x))
plt.ylim(0, max(y))
plt.plot(x, y)
plt.title("Title")
plt.xlabel("Label X")
plt.ylabel("Label Y")
plt.show()

customize

1
2
3
4
5
6
7
8
# Line appearance collected in one place: colour, marker shape/size and dash pattern.
line_style = dict(color="green", marker='>', markersize=20, linestyle='dashdot')
plt.plot(x, y, **line_style)

plt.xlim(1, 3)
plt.ylim(0, 150)

plt.xlabel('Label X')
plt.title('Title')
plt.ylabel('Label Y');

example

1
2
3
4
5
6
7
8
9
10
# read_csv lives in the pandas namespace; the bare name raised NameError.
df = pd.read_csv('elderly.csv')
year = df['Year'].values.tolist()
print(year)
sixtyFiveAbove = df['65 years old and above'].values.tolist()
print(sixtyFiveAbove)

# NOTE(review): the recorded output shows these counts as strings such as
# '1,022'; matplotlib treats string values as categories rather than numbers.
# Consider pd.read_csv(..., thousands=',') — TODO confirm against elderly.csv.
plt.plot(year, sixtyFiveAbove, color="green", marker='>', markersize=20, linestyle='dashdot')
plt.title('Title')
plt.xlabel('Label X')
plt.ylabel('Label Y');
1
2
[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
['836', '859', '892', '931', '975', '1,022', '1,067', '1,116', '1,164', '1,222', '1,297']

image-20221115231845515

Example2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import random

n1 = 100
n2 = 1000
n3 = 100000


def rolls_plt(arg):
    """Simulate ``arg`` die rolls and draw the relative frequency of each face.

    arg: number of rolls to simulate; must be positive.
    Raises ValueError when ``arg`` is not positive (the original divided by
    zero for arg == 0).
    """
    if arg <= 0:
        raise ValueError("arg must be a positive number of rolls")

    num = [0] * 6           # num[k] counts how often face k + 1 came up
    x = [1, 2, 3, 4, 5, 6]  # face values shown on the x axis

    for _ in range(arg):
        r_num = random.randint(0, 5)  # randint is inclusive on both ends
        num[r_num] += 1

    # Convert the counts to relative frequencies.
    y = [count / arg for count in num]
    plt.bar(x, y, color='royalblue')
    plt.show()


rolls_plt(n1)
#rolls_plt(n2)
#rolls_plt(n3)

image-20221115232029804

Example3

$x = \cos(\theta)$

$y = \sin(\theta) + \cos^{2}(\theta)/3$

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
from matplotlib import pyplot as plt
import math

# Sample 100 angles over one full turn and trace the parametric curve.
theta = np.linspace(0, 2 * np.pi, 100)
cos_theta = np.cos(theta)
x = cos_theta
# Precedence made explicit: ** binds tighter than /, so this is cos(theta)^2 / 3.
# NOTE(review): if an exponent of 2/3 was intended instead, the expression
# must be rewritten (e.g. np.cbrt(cos_theta ** 2)) — TODO confirm.
y = np.sin(theta) + (cos_theta ** 2) / 3

plt.plot(x, y)
plt.show()

image-20221115232115184

Example4

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import matplotlib.pyplot as plt
from matplotlib import animation
import numpy as np
from IPython.display import display, clear_output

fig = plt.figure()  # Create a canvas to be painted

# theta is defined here so the cell runs on its own; previously it relied on
# a variable left over from the preceding example.
theta = np.linspace(0, 2 * np.pi, 100)
x = np.cos(theta)
y = np.sin(theta) + np.cos(theta) ** 2/3

ax = fig.subplots()
l = ax.plot(x, y)
l = l[0]  # ax.plot returns a list of lines; keep the single line object


def animate(i):
    """Show only the first ``i`` points of the curve and return the line."""
    l.set_data(x[:i], y[:i])
    return l


# Redraw the figure once per frame. The range ends at len(x) so the final
# frame contains every point (range(len(x)) stopped one point short).
for i in range(1, len(x) + 1):
    animate(i)
    clear_output(wait=True)  # replace the previous frame instead of stacking outputs
    display(fig)
plt.show()

image-20221115232217987

Lab7 Seaborn/class

Write a class for Person

Basic Properties: Age, Name, Sex.

Extension properties: Working and Sleeping — just consider the hours the person works and sleeps every day.

Then, instantiate the class as two different persons.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class Person:
    """A person described by name, age and sex, with daily work/sleep reports."""

    def __init__(self, name, age, sex):
        self.name = name
        self.age = age
        self.sex = sex

    def Working(self, hours):
        """Print how many hours this person works every day."""
        # The spaces around the verb were missing, which glued the words together.
        print(self.name + ' working ' + str(hours) + ' hours everyday')

    def Sleep(self, hours):
        """Print how many hours this person sleeps every day."""
        # 'sleeping' matches the recorded output.
        print(self.name + ' sleeping ' + str(hours) + ' hours everyday')


object1 = Person('jack', 18, 'male')
object2 = Person('mark', 19, 'male')
print(object1.name, object1.age, object1.sex)
print(object2.name, object2.age, object2.sex)
object1.Working(4)
object2.Sleep(4)
1
2
3
4
jack 18 male
mark 19 male
jack working 4 hours everyday
mark sleeping 4 hours everyday

Create a matrix with the following style(pad)

1
2
3
import numpy as np

a = [[1, 2], [3, 4]]
# Pad with zeros: 1 row above, 9 rows below, 3 columns on each side,
# giving a 12x8 result.
a_pad = np.pad(a, ((1, 9), (3, 3)), 'constant')
print(a_pad)  # the recorded output needs this print; the cell never displayed the result
1
2
3
4
5
6
7
8
9
10
11
12
[[0 0 0 0 0 0 0 0]
[0 0 0 1 2 0 0 0]
[0 0 0 3 4 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0]]

Create a matrix with the following style

1
2
3
4
import numpy as np

# Values 1.0 .. 4.0: the stop value is exclusive, so 5 is needed to include 4
# (arange(1, 4) only produced three values), and dtype=float matches the
# 4x4 float matrix of the recorded output.
a = np.arange(1, 5, dtype=float)
a_d = np.diag(a)  # diag must be qualified with np; the bare name raised NameError
print(a_d)
1
2
3
4
[[1. 0. 0. 0.]
[0. 2. 0. 0.]
[0. 0. 3. 0.]
[0. 0. 0. 4.]]

Seaborn->pass

Lab8 Pandas_1

Exam1

1. Create a 5-D random numpy list var_list

1
2
import numpy as np

# Five samples drawn from the standard normal distribution.
sample_count = 5
var_list = np.random.randn(sample_count)

2. use “uuid” to generate 5 random keys (use str(uuid.uuid4())), and store them into a list key_list

1
2
import uuid

# Keep only the first six characters of each random UUID string as a short key.
key_list = []
for _ in range(5):
    key_list.append(str(uuid.uuid4())[:6])

3. Create a dictionary dict from var_list and key_list

1
# Pair the first five keys with the first five values, position by position.
dict_tmp = dict(zip(key_list[:5], var_list[:5]))

4. Create a Pandas Series from a) var_list b) var_list and key_list c) dict

1
2
3
4
import pandas as pd

# a) values only: pandas supplies a default 0..n-1 index
pd_series_var = pd.Series(var_list)
# b) values with explicit labels
pd_series_var_key = pd.Series(var_list, index=key_list)
# c) from a dictionary: keys become the index
pd_series_dict = pd.Series(dict_tmp)

5. Convert the Series back to the list and dictionary

1
2
# Series -> plain Python list (tolist and to_list are the same method).
var_list_new = pd_series_var_key.tolist()
# Series -> dictionary keyed by the index labels.
dict_new = pd_series_dict.to_dict()

6. Find out the elements larger than zero

1
# Boolean-mask selection of the elements greater than zero.
# The Series is named pd_series_dict; 'pd.series_dict' looked the name up on
# the pandas module and raised AttributeError.
pd_series_positive = pd_series_dict[pd_series_dict > 0]

7. Calculate the proportion of positive elements in the Series

1
# Share of positive elements = count of positives / total count.
# Fixed the misspelt name 'pd_series_postive', which raised NameError.
proportion = len(pd_series_positive) / len(pd_series_dict)

8. Write down as many ways of forming a list that contains the values of Series elements

1
2
3
4
5
6
7
8
# Way 1: the built-in conversion.
val_1 = pd_series_dict.to_list()

# Way 2: iterate over (label, value) pairs. Series.iteritems() was removed in
# pandas 2.0; items() works in both old and new versions.
val_2 = []
for idx, ival in pd_series_dict.items():
    val_2.append(ival)

# Way 3: the underlying NumPy array (an ndarray, not a list).
val_3 = pd_series_dict.values

# Way 4: look every value up by its index label.
val_tmp = pd_series_dict.index
val_4 = [pd_series_dict[ikey] for ikey in val_tmp]

9. Calculate the proportion of elements that are larger than the mean value of the Series

1
2
3
# Mean of the Series, then the share of elements strictly above it.
mean_val = np.mean(pd_series_dict)
above_mean = pd_series_dict > mean_val
pd_series_larger_than_mean = pd_series_dict[above_mean]
proportion_2 = len(pd_series_larger_than_mean) / len(pd_series_dict)

Exam2

1. Write codes to create a random dict x which has 5 random keys and each key corresponds to a 6-D numpy array

1
2
3
4
5
6
7
import uuid

# Build a dict of five random short keys, each mapped to six standard-normal
# samples; key_list records the keys in creation order.
data = {}
key_list = []
for i in range(5):
    rand_key = str(uuid.uuid4())[:6]
    key_list.append(rand_key)
    data[rand_key] = np.random.randn(6)

2. Create a pandas dataframe using x

1
2
# Each dictionary key becomes a column; each array supplies that column's rows.
df = pd.DataFrame.from_dict(data)
print(df)

3. Create a pandas dataframe using a subset of x, in the subset of x, only keys that start with a digit are chosen

1
2
3
4
# Keep only the keys whose first character is a decimal digit.
digit_chars = set('0123456789')
sub_key_list = []
for i_key in key_list:
    if i_key[0] in digit_chars:
        sub_key_list.append(i_key)
print(sub_key_list)
# columns= restricts the frame to the selected keys.
df_sub = pd.DataFrame(data, columns=sub_key_list)
print(df_sub)

4. Create a new pandas dataframe using the codes in the previous slide

1
2
3
# Eight consecutive daily timestamps starting on 2000-01-01.
# The function is pd.date_range; 'pd.data_range' raised AttributeError.
dates = pd.date_range('1/1/2000', periods=8)
# 8x4 frame of standard-normal values, indexed by date.
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

5. Select rows whose attribute A is smaller than the mean of attribute C

1
2
3
4
5
# Column C and its mean.
df_c = df['C']
print(df_c)
mean_c = np.mean(df_c)
print(mean_c)
# Boolean mask: rows whose A value is below the mean of C.
a_below_mean_c = df['A'] < mean_c
print(df[a_below_mean_c])

6. Can you select the column B and C using [] indexing? Try it out and see what happens

1
2
# Indexing with a list of labels (note the double brackets) selects several columns.
wanted_columns = ['B', 'C']
df_bc = df[wanted_columns]
print(df_bc)

Exam3

Convert a pandas Series to a Python list

1
2
3
4
5
6
7
8
9
10
import pandas as pd

series = pd.Series([1, 2, 3, 4, 5])
print("Pandas Series and type")
print(series)
print(type(series))

# tolist() builds a new plain Python list from the Series values.
print("Convert Pandas Series to Python list")
as_list = series.tolist()
print(as_list)
print(type(as_list))
1
2
3
4
5
6
7
8
9
10
11
Pandas Series and type
0 1
1 2
2 3
3 4
4 5
dtype: int64
<class 'pandas.core.series.Series'>
Convert Pandas Series to Python list
[1, 2, 3, 4, 5]
<class 'list'>

convert a dictionary to a Pandas series

1
2
3
4
5
6
7
8
9
import pandas as pd

# Named source_dict rather than 'dict' so the builtin dict type is not
# shadowed for the rest of the session.
source_dict = {'a': 100, 'b': 200, 'c': 300, 'd': 400, 'e': 500}
print("Original dictionary:")
print(source_dict)

# The dictionary keys become the index labels of the Series.
new_series = pd.Series(source_dict)
print("Converted series:")
print(new_series)
1
2
3
4
5
6
7
8
9
Original dictionary:
{'a': 100, 'b': 200, 'c': 300, 'd': 400, 'e': 500}
Converted series:
a 100
b 200
c 300
d 400
e 500
dtype: int64

convert a NumPy array to a Pandas series

1
2
3
4
5
6
7
8
9
10
import numpy as np
import pandas as pd

values = [1, 2, 3, 4, 5]
np_array = np.array(values)
print("NumPy array:")
print(np_array)

# Wrapping the array in a Series adds the default 0..n-1 index.
new_series = pd.Series(np_array)
print("Converted Pandas series:")
print(new_series)
1
2
3
4
5
6
7
8
9
NumPy array:
[1 2 3 4 5]
Converted Pandas series:
0 1
1 2
2 3
3 4
4 5
dtype: int32

convert the column of a DataFrame as a Series

1
2
3
4
5
6
7
8
9
10
11
12
13
import pandas as pd

d = {
    'col1': [1, 2, 3, 4, 7, 11],
    'col2': [4, 5, 6, 9, 5, 0],
    'col3': [7, 5, 8, 12, 1, 11],
}
df = pd.DataFrame(d)
print(type(df))

print("Original DataFrame")
print(df)
# Select the first column through its label; a single column is a Series.
first_label = df.columns[0]
s1 = df[first_label]

print("\n1st column as a Series:")
print(s1)
print(type(s1))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
<class 'pandas.core.frame.DataFrame'>
Original DataFrame
col1 col2 col3
0 1 4 7
1 2 5 5
2 3 6 8
3 4 9 12
4 7 5 1
5 11 0 11

1st column as a Series:
0 1
1 2
2 3
3 4
4 7
5 11
Name: col1, dtype: int64
<class 'pandas.core.series.Series'>

Create a subset of a series

1
2
3
4
5
6
7
8
9
10
import pandas as pd  # was 'mport', a syntax error

s = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
print("Original Data Series:")
print(s)

print("\nSubset of the above Data Series:")
n = 10
# The boolean mask keeps only the elements strictly greater than n.
new_s = s[s > n]
print(new_s)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
Original Data Series:
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
dtype: int64

Subset of the above Data Series:
11 11
12 12
13 13
14 14
15 15
dtype: int64

Lab9 Pandas_2

1
2
import numpy as np
import pandas as pd

1. Write code to create two DataFrames, df_left and df_right, with the columns “[key, lval1, lval2]” and “[key, rval1, rval2]”, whose key values are “[a,b,c]” and “[b,c,d]” respectively. Generate normally distributed random numbers for the “lval” and “rval” elements.

1
2
3
4
5
6
# Separator printed between the two frames (matches the recorded output).
split_str = '---------'
# Both frames share the 'key' column; the value columns hold standard-normal samples.
left_df = pd.DataFrame({'key': ['a', 'b', 'c'],
                        'lval1': np.random.randn(3),
                        'lval2': np.random.randn(3)})
# Fixed 'np.randoom', which raised AttributeError.
right_df = pd.DataFrame({'key': list('bcd'),
                         'rval1': np.random.randn(3),
                         'rval2': np.random.randn(3)})
print(left_df)
print(split_str)
print(right_df)
1
2
3
4
5
6
7
8
9
  key     lval1     lval2
0 a -0.306740 0.370246
1 b -1.633727 -0.351369
2 c 1.558975 -0.179692
---------
key rval1 rval2
0 b -0.036699 0.724182
1 c -1.241680 -1.695795
2 d 1.580775 -1.271330

2. Compute the left outer join of df_left and df_right, check out the results

1
2
# Left outer join: keep every row of left_df; right-hand values are NaN where no key matches.
left_merge = left_df.merge(right_df, how='left')
print(left_merge)
1
2
3
4
  key     lval1     lval2     rval1     rval2
0 a -0.306740 0.370246 NaN NaN
1 b -1.633727 -0.351369 -0.036699 0.724182
2 c 1.558975 -0.179692 -1.241680 -1.695795

3. Change the name “key” of df_left to “key_left”, re-run step 2 and see what happens

1
# Rename on a copy so left_df keeps its 'key' column for the following steps
# (assigning left_df.columns renamed it in place and broke the later merges).
left_df_renamed = left_df.rename(columns={'key': 'key_left'})
# Re-running step 2 now fails: the frames no longer share a column to join on.
try:
    pd.merge(left_df_renamed, right_df, how='left')
except pd.errors.MergeError as err:
    print(err)

4. Compute the right outer join of df_left and df_right in step 2, check out the results

1
2
# Right outer join: keep every row of right_df; left-hand values are NaN where no key matches.
right_merge = left_df.merge(right_df, how='right')
print(right_merge)
1
2
3
4
  key     lval1     lval2     rval1     rval2
0 b -1.633727 -0.351369 -0.036699 0.724182
1 c 1.558975 -0.179692 -1.241680 -1.695795
2 d NaN NaN 1.580775 -1.271330

5. Compute the full outer join of df_left and df_right in step 2, check out the results

1
2
# Full outer join: keep the rows of both frames, filling the gaps with NaN.
outer_merge = left_df.merge(right_df, how='outer')
print(outer_merge)
1
2
3
4
5
  key     lval1     lval2     rval1     rval2
0 a -0.306740 0.370246 NaN NaN
1 b -1.633727 -0.351369 -0.036699 0.724182
2 c 1.558975 -0.179692 -1.241680 -1.695795
3 d NaN NaN 1.580775 -1.271330

6. Compute the inner join of df_left and df_right in step 2, check out the results

1
2
# Inner join: keep only the rows whose key appears in both frames.
inner_merge = left_df.merge(right_df, how='inner')
print(inner_merge)
1
2
3
  key     lval1     lval2     rval1     rval2
0 b -1.633727 -0.351369 -0.036699 0.724182
1 c 1.558975 -0.179692 -1.241680 -1.695795

7. Get the floating value columns of df_left (lval1,lval2), get the square root of the absolute values using apply

1
2
3
4
5
6
7
8
9
print(left_df)
# Restrict the frame to its two floating-point value columns.
value_columns = ['lval1', 'lval2']
left_df_val = left_df[value_columns]
print(left_df_val)

# First pass: absolute values.
left_df_val_abs = left_df_val.apply(np.abs)
print(left_df_val_abs)

# Second pass: square roots of the absolute values.
left_df_val_abs_sqrt = left_df_val_abs.apply(np.sqrt)
print(left_df_val_abs_sqrt)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
  key     lval1     lval2
0 a -0.306740 0.370246
1 b -1.633727 -0.351369
2 c 1.558975 -0.179692
lval1 lval2
0 -0.306740 0.370246
1 -1.633727 -0.351369
2 1.558975 -0.179692
lval1 lval2
0 0.306740 0.370246
1 1.633727 0.351369
2 1.558975 0.179692
lval1 lval2
0 0.553841 0.608478
1 1.278173 0.592764
2 1.248589 0.423901

8. Try using numpy to directly calculate the above operations on df_left

1
2
# NumPy functions accept the DataFrame directly, so no apply() call is needed.
absolute_values = np.abs(left_df_val)
left_df_val_abs_sqrt_np = np.sqrt(absolute_values)
print(left_df_val_abs_sqrt_np)
1
2
3
4
      lval1     lval2
0 0.553841 0.608478
1 1.278173 0.592764
2 1.248589 0.423901

9. Write the apply_map functions to accomplish step 7

1
2
3
4
5
6
7
print(left_df)
value_columns = ['lval1', 'lval2']
left_df_val = left_df[value_columns]
print(left_df_val)
# applymap calls the function once per element.
# NOTE(review): recent pandas releases deprecate applymap in favour of
# DataFrame.map — confirm against the installed version.
left_df_val_abs = left_df_val.applymap(np.abs)
print(left_df_val_abs)
left_df_val_abs_sqrt = left_df_val_abs.applymap(np.sqrt)
print(left_df_val_abs_sqrt)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
  key     lval1     lval2
0 a -0.306740 0.370246
1 b -1.633727 -0.351369
2 c 1.558975 -0.179692
lval1 lval2
0 -0.306740 0.370246
1 -1.633727 -0.351369
2 1.558975 -0.179692
lval1 lval2
0 0.306740 0.370246
1 1.633727 0.351369
2 1.558975 0.179692
lval1 lval2
0 0.553841 0.608478
1 1.278173 0.592764
2 1.248589 0.423901

10. Get the data of “Countries and dependencies by area” from wiki and save to the excel excluding index

1
2
3
4
5
6
7
8
9
import io

import requests

url_wiki = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area'
# Browser-like User-Agent; the missing spaces in the original string were restored.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# The timeout stops the request from hanging forever; raise_for_status stops
# an error page from being parsed as if it were the article.
r = requests.get(url_wiki, headers=headers, timeout=30)
r.raise_for_status()

# read_html returns one DataFrame per <table>. Wrapping the markup in StringIO
# avoids the deprecated "literal HTML string" input of newer pandas.
data = pd.read_html(io.StringIO(r.text))
print([idata.shape for idata in data])
# NOTE(review): index 1 is assumed to be the area table; check the shapes
# printed above, since the page layout can change.
data_area = data[1]
print(data_area)
# index=False leaves the DataFrame's row index out of the spreadsheet.
data_area.to_excel('area_info.xlsx', index=False)