mobitwin-erasmus/process_and_combine_erasmus_data.py at main · DigitalGeographyLab/mobitwin-erasmus · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 21 15:15:35 2024

# INFO

Run the script by opening a terminal in the directory of the script and typing:
    python process_and_combine_erasmus_data.py -i /path/to/excels/ -o /path/to/output/ -n combined_erasmus_data

"""

import argparse
import pandas as pd
import glob

# initialize argument parser
ap = argparse.ArgumentParser()

# set up arguments
ap.add_argument("-i", "--input", required=True,
                help="Path to directory with the Erasmus+ mobility Excel\n"
                " files. NOTE: Make sure the directory does not contain "
                "other .xlsx files!")

ap.add_argument("-o", "--output", required=True,
                help="Path to directory where resulting CSV and pickled "
                "dataframe are saved.")

ap.add_argument("-n", "--name", required=True,
                help="Designate the name of the output files without a "
                "file extension.")

# parse arguments
args = vars(ap.parse_args())

# list of csv files
files = []

# set up input filepath
i_fp = args["input"] + "*.xlsx"

# glob all the csvs
for file in glob.glob(i_fp):
    files.append(file)

# empty list for dataframes
edf = []

# loop over the csv files
for fi in files:

    # get filename
    fn = fi.split('/')[-1][:-5]

    # get year
    yr = int(fn.split('-')[-1])

    # print message
    print('[INFO] - Reading file ' + str(fn))

    # check if file is from newer erasmus program
    if yr >= 2021:

        # get excel file object
        xls = pd.ExcelFile(fi)

        # get sheet names
        sheets = xls.sheet_names

        # empty list of mobilities from both programmes
        emprog = []

        # loop over sheets
        for sh in sheets:

            # read sheet
            data = pd.read_excel(fi, sheet_name=sh)

            # append
            emprog.append(data)

        # concatenate into one dataframe
        df = pd.concat(emprog, ignore_index=True)

        # forget list and data
        del emprog, data, xls

    # check if data is from earlier programme
    else:

        # read csv in
        df = pd.read_excel(fi)

    # drop NA from mobility duration
    df = df.dropna(subset=['Mobility Duration'])

    # drop shorter mobility
    df = df[df['Mobility Duration'] >= 90]

    # drop records where age isn't reported
    df = df[df['Participant Age'] != '-']

    # drop too young participants
    df = df[df['Participant Age'].astype(int) >= 18]

    # ensure all participants are students
    df = df[df['Participant Profile'] == 'Learner']

    # send to empty list
    edf.append(df)

# final concatenation
print('[INFO] - Concatenating result...')
result = pd.concat(edf, ignore_index=True)

# get year
result['year'] = result['Mobility Start Year/Month'].apply(lambda x: int(x.split('-')[0]))

# assign mover type
result['mov_type'] = result['Mobility Duration'].apply(lambda x: 'Short-term' if x < 330 else 'Long-term')

# set up output filenames
output = args['output'] + args['name']

# save as csv
print('[INFO] - Saving result to CSV...')
result.to_csv(output + ".csv", sep=';', encoding='utf-8')

# save as pickle
print('[INFO] - Saving result to pickle...')
result.to_pickle(output + ".pkl")

print('[INFO] - ... done!')