-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_and_combine_erasmus_data.py
More file actions
134 lines (93 loc) · 3.2 KB
/
process_and_combine_erasmus_data.py
File metadata and controls
134 lines (93 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 21 15:15:35 2024
# INFO
Run the script by opening a terminal in the directory of the script and typing:
python process_and_combine_erasmus_data.py -i /path/to/excels/ -o /path/to/output/ -n combined_erasmus_data
"""
import argparse
import pandas as pd
import glob
# initialize argument parser
ap = argparse.ArgumentParser()
# set up arguments
ap.add_argument("-i", "--input", required=True,
help="Path to directory with the Erasmus+ mobility Excel\n"
" files. NOTE: Make sure the directory does not contain "
"other .xlsx files!")
ap.add_argument("-o", "--output", required=True,
help="Path to directory where resulting CSV and pickled "
"dataframe are saved.")
ap.add_argument("-n", "--name", required=True,
help="Designate the name of the output files without a "
"file extension.")
# parse arguments
args = vars(ap.parse_args())
# list of csv files
files = []
# set up input filepath
i_fp = args["input"] + "*.xlsx"
# glob all the csvs
for file in glob.glob(i_fp):
files.append(file)
# empty list for dataframes
edf = []
# loop over the csv files
for fi in files:
# get filename
fn = fi.split('/')[-1][:-5]
# get year
yr = int(fn.split('-')[-1])
# print message
print('[INFO] - Reading file ' + str(fn))
# check if file is from newer erasmus program
if yr >= 2021:
# get excel file object
xls = pd.ExcelFile(fi)
# get sheet names
sheets = xls.sheet_names
# empty list of mobilities from both programmes
emprog = []
# loop over sheets
for sh in sheets:
# read sheet
data = pd.read_excel(fi, sheet_name=sh)
# append
emprog.append(data)
# concatenate into one dataframe
df = pd.concat(emprog, ignore_index=True)
# forget list and data
del emprog, data, xls
# check if data is from earlier programme
else:
# read csv in
df = pd.read_excel(fi)
# drop NA from mobility duration
df = df.dropna(subset=['Mobility Duration'])
# drop shorter mobility
df = df[df['Mobility Duration'] >= 90]
# drop records where age isn't reported
df = df[df['Participant Age'] != '-']
# drop too young participants
df = df[df['Participant Age'].astype(int) >= 18]
# ensure all participants are students
df = df[df['Participant Profile'] == 'Learner']
# send to empty list
edf.append(df)
# final concatenation
print('[INFO] - Concatenating result...')
result = pd.concat(edf, ignore_index=True)
# get year
result['year'] = result['Mobility Start Year/Month'].apply(lambda x: int(x.split('-')[0]))
# assign mover type
result['mov_type'] = result['Mobility Duration'].apply(lambda x: 'Short-term' if x < 330 else 'Long-term')
# set up output filenames
output = args['output'] + args['name']
# save as csv
print('[INFO] - Saving result to CSV...')
result.to_csv(output + ".csv", sep=';', encoding='utf-8')
# save as pickle
print('[INFO] - Saving result to pickle...')
result.to_pickle(output + ".pkl")
print('[INFO] - ... done!')