# -*- coding: utf-8 -*-
"""
Created on Fri Dec 27 20:58:48 2024

@author: ramio
"""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
import pickle

# NLTK Portuguese stopwords (only needed once)
nltk.download('stopwords')

# Load Portuguese stopwords
portuguese_stopwords = stopwords.words('portuguese')

# Load the dataset
file_path = 'Registo dos livros (Guardado automaticamente).xlsx'
df = pd.read_excel(file_path, header=1)

# Data cleaning: strip whitespace from the column names and drop the empty column
df.columns = df.columns.str.strip()
df = df.drop(['Unnamed: 14'], axis=1)

# Split the data into rows with and without a 'Tema & Localização' label
missing_data = df[df["Tema & Localização"].isna()]         # rows whose label will be predicted later
train_data = df[df["Tema & Localização"].notna()].copy()   # labelled rows used for training

# Calculate the class counts
class_counts = train_data['Tema & Localização'].value_counts()
print(class_counts)

# Identify rare classes (fewer than 5 samples)
rare_classes = class_counts[class_counts < 5].index
print(rare_classes)

# Replace rare classes with a single 'Other' label
train_data['Tema & Localização'] = train_data['Tema & Localização'].replace(rare_classes, 'Other')

# Feature selection: the target column 'Tema & Localização' is deliberately left out
# of the features, otherwise the model would train on the very label it must predict
x = train_data[['Titulo', 'Autor', 'Editora']]
y = train_data['Tema & Localização']

# Convert the text columns to numerical features using TF-IDF
tfidf = TfidfVectorizer(stop_words=portuguese_stopwords, max_features=1000)

# Vectorize each text column separately
# (the vectorizer is fitted on 'Titulo' and its vocabulary is reused for the other columns)
x_tfidf_titulo = tfidf.fit_transform(x['Titulo'].fillna(''))   # fit and transform 'Titulo'
x_tfidf_autor = tfidf.transform(x['Autor'].fillna(''))         # transform 'Autor'
x_tfidf_editora = tfidf.transform(x['Editora'].fillna(''))     # transform 'Editora'

# Combine the TF-IDF features from all columns into one sparse feature matrix
x_combined = hstack([x_tfidf_titulo, x_tfidf_autor, x_tfidf_editora])

# Data split
x_train, x_test, y_train, y_test = train_test_split(x_combined, y, test_size=0.2, random_state=42)

# Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(x_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix
print('Confusion matrix:')
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    cmap='Blues',
    colorbar=True
)
plt.xticks(rotation=45, fontsize=5, ha='right')
plt.yticks(fontsize=5)
plt.title('Confusion Matrix')
plt.show()

# Classification report for additional evaluation metrics
print('Classification Report:')
print(classification_report(y_test, y_pred))

"""Predicting missing values"""

# Select the feature columns for the rows with a missing 'Tema & Localização'
x_missing = missing_data[['Titulo', 'Autor', 'Editora']]

# Vectorize the missing data with the already fitted vectorizer
x_missing_tfidf_titulo = tfidf.transform(x_missing['Titulo'].fillna(''))
x_missing_tfidf_autor = tfidf.transform(x_missing['Autor'].fillna(''))
x_missing_tfidf_editora = tfidf.transform(x_missing['Editora'].fillna(''))
# Combine the TF-IDF features for the missing data
x_missing_combined = hstack([x_missing_tfidf_titulo, x_missing_tfidf_autor, x_missing_tfidf_editora])

# Predict the missing 'Tema & Localização' values
y_missing_pred = rf_model.predict(x_missing_combined)

# Replace the missing values in the original dataframe with the predicted values
df.loc[df["Tema & Localização"].isna(), 'Tema & Localização'] = y_missing_pred

# Display the dataframe with the predicted values filled in
print(df.head())

# Save the trained model
with open('book_category_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
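
# --- Optional usage sketch (illustration only) -------------------------------
# A minimal sketch of how the pickled model could be reloaded and applied to a
# single new book. It assumes the `tfidf` vectorizer fitted above is still in
# memory (the pickle file stores only the classifier), and the example book
# below is a hypothetical entry, not part of the dataset.
with open('book_category_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

new_book = pd.DataFrame({
    'Titulo': ['Os Lusíadas'],        # hypothetical title
    'Autor': ['Luís de Camões'],      # hypothetical author
    'Editora': ['Porto Editora'],     # hypothetical publisher
})

# Build the same three-block TF-IDF feature matrix used during training
new_features = hstack([
    tfidf.transform(new_book['Titulo'].fillna('')),
    tfidf.transform(new_book['Autor'].fillna('')),
    tfidf.transform(new_book['Editora'].fillna('')),
])
print(loaded_model.predict(new_features))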