-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_loader.py
52 lines (43 loc) · 2.2 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""
Module to load and clean course data and similarity matrix.
"""
import os
import pickle
import pandas as pd
def load_clean_data(base_dir=None):
    """
    Load the precomputed similarity matrix and the cleaned course data.

    The similarity matrix is unpickled from ``models/similarity_matrix.pkl`` and
    the course data is read from ``data/coursera.csv``; both paths are resolved
    relative to ``base_dir``. After loading, rows that are duplicated across the
    key course columns are dropped and non-ASCII characters are stripped from
    every string cell (non-string cells are left untouched).

    Args:
        base_dir: Directory that contains the ``models`` and ``data`` folders.
            Defaults to the directory this module is located in.

    Returns:
        - data: Clean DataFrame containing course information.
        - similarity_matrix: Precomputed similarity matrix for courses.
        - course_names: List of course names for use in the recommendation system.

    Raises:
        RuntimeError: If either file is missing, the pickle is corrupt or
            truncated, or the CSV is empty or cannot be parsed. The original
            error is attached as ``__cause__``.
    """
    if base_dir is None:
        # Default to the directory where this script is located.
        base_dir = os.path.dirname(os.path.abspath(__file__))

    # Build paths to the necessary files in the 'models' and 'data' directories.
    similarity_matrix_path = os.path.join(base_dir, 'models', 'similarity_matrix.pkl')
    data_path = os.path.join(base_dir, 'data', 'coursera.csv')

    try:
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # the similarity matrix must only ever come from a trusted source.
        with open(similarity_matrix_path, 'rb') as f:
            similarity_matrix = pickle.load(f)
        data = pd.read_csv(data_path, encoding='utf-8')
    except (
        FileNotFoundError,
        pickle.UnpicklingError,
        EOFError,  # raised by pickle.load on an empty or truncated file
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as e:
        # Report the failure, then re-raise with the original error chained so
        # the root cause stays visible in the traceback.
        print(f"Error loading files: {e}")
        raise RuntimeError(f"Error loading files: {e}") from e

    # Drop duplicates from the course data based on key columns.
    # NOTE(review): the row labels are not reset after de-duplication; confirm
    # how callers align rows of `data` with `similarity_matrix`.
    data = data.drop_duplicates(subset=['Course Name', 'University', 'Difficulty Level',
                                        'Course Rating', 'Course URL', 'Course Description'])

    def remove_non_ascii(text):
        """Return ``text`` without non-ASCII characters; pass non-strings through."""
        return text.encode('ascii', 'ignore').decode('ascii') if isinstance(text, str) else text

    for column in data:
        data[column] = data[column].apply(remove_non_ascii)

    # Extract the list of course names for use in the recommendation system.
    course_names = data['Course Name'].tolist()
    return data, similarity_matrix, course_names