Skip to content
Snippets Groups Projects
Commit 4d96e97f authored by Edward Mawuko Samlafo-Adams
Browse files

adjustments to model training

parent b36853d0
No related branches found
No related tags found
No related merge requests found
No preview for this file type
# Import necessary libraries
import streamlit as st
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Function to load the dataset
def load_data(file_name: str = "cleaned_expanded_Job_Posting.csv") -> pd.DataFrame:
    """Load the cleaned, expanded job-posting dataset.

    The file is looked up in the ``data`` directory that sits next to the
    directory containing this module (``<module dir>/../data/<file_name>``).
    Because the path is built with ``os.path.join``, an absolute
    ``file_name`` replaces the preceding components and is read as given.

    Args:
        file_name: Name of the CSV file inside the ``data`` directory, or an
            absolute path to a CSV file.

    Returns:
        The parsed CSV as a DataFrame.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when the file
            does not exist.
    """
    data_path = os.path.join(os.path.dirname(__file__), "..", "data", file_name)
    return pd.read_csv(data_path)
# Main app function
def app():
    """Render the "Model Training & Prediction" Streamlit page.

    The page loads the job-posting dataset via ``load_data()``, trains a
    Random Forest and/or a Logistic Regression pipeline on demand to predict
    "Primary Contract Type" from "Location", "Category" and "Seniority",
    shows a comparison table of the stored metrics, and offers a form that
    predicts the contract type for a user-selected combination of features.

    Fitted pipelines and their metrics are kept in ``st.session_state`` so
    they survive the script re-run that Streamlit performs on every widget
    interaction.
    """
    target_column = "Primary Contract Type"
    feature_columns = ["Location", "Category", "Seniority"]

    st.title("Model Training & Prediction")
    st.subheader("Predict Job Contract Type Based on Job Features")

    df = load_data()

    # Bail out early when the dataset cannot support training at all.
    if target_column not in df.columns:
        st.warning("No job type data available for model training.")
        return
    missing_features = [col for col in feature_columns if col not in df.columns]
    if missing_features:
        st.warning(
            "Missing feature columns for model training: "
            + ", ".join(missing_features)
        )
        return

    # Rows without a label cannot be used for supervised training.
    df = df.dropna(subset=[target_column])
    if len(df) < 2:
        # train_test_split needs at least one row on each side of the split.
        st.warning("Not enough labelled rows available for model training.")
        return

    # Job title and description are deliberately left out of the features.
    X = df[feature_columns]
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf_pipeline = _build_pipeline(
        feature_columns,
        RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    )
    log_reg_pipeline = _build_pipeline(
        feature_columns,
        LogisticRegression(max_iter=500, random_state=42),
        scale=True,
    )

    if st.button("Train Random Forest Classifier"):
        _train_and_record(
            "Random Forest Classifier", "rf_pipeline", rf_pipeline,
            X_train, X_test, y_train, y_test,
        )

    if st.button("Train Logistic Regression"):
        _train_and_record(
            "Logistic Regression", "log_reg_pipeline", log_reg_pipeline,
            X_train, X_test, y_train, y_test,
        )

    # Display model comparison (one row per trained model).
    if "model_results" in st.session_state:
        st.write("### Model Comparison")
        comparison_df = pd.DataFrame(st.session_state["model_results"]).T
        st.dataframe(comparison_df.style.highlight_max(axis=0, color="lightgreen"))

    # Real-time job type prediction.
    st.write("### Predict Job Type")
    user_location = st.selectbox("Location", df["Location"].unique())
    user_category = st.selectbox("Job Category", df["Category"].unique())
    user_seniority = st.selectbox("Seniority Level", df["Seniority"].unique())
    input_data = pd.DataFrame({
        "Location": [user_location],
        "Category": [user_category],
        "Seniority": [user_seniority],
    })

    if st.button("Get Prediction"):
        trained_models = (
            ("rf_pipeline", "Random Forest Classifier"),
            ("log_reg_pipeline", "Logistic Regression"),
        )
        for state_key, label in trained_models:
            if state_key in st.session_state:
                prediction = st.session_state[state_key].predict(input_data)[0]
                st.write(f"**{label} Prediction:** {prediction}")
            else:
                st.warning(f"{label} model not trained yet.")


def _build_pipeline(categorical_features, classifier, scale=False):
    """Return a pipeline that one-hot encodes the features, then classifies.

    Each call creates its own ``ColumnTransformer`` so that two pipelines
    never share (and re-fit) the same preprocessing object.

    Args:
        categorical_features: Column names to one-hot encode.
        classifier: Unfitted scikit-learn estimator used as the final step.
        scale: When true, insert a ``StandardScaler(with_mean=False)`` step
            between the encoder and the classifier.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            # Categories unseen during fit are encoded as all zeros.
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ]
    )
    steps = [("preprocessor", preprocessor)]
    if scale:
        steps.append(("scaler", StandardScaler(with_mean=False)))
    steps.append(("classifier", classifier))
    return Pipeline(steps)


def _train_and_record(label, state_key, pipeline, X_train, X_test, y_train, y_test):
    """Fit *pipeline*, score it on the test split and store it in the session.

    The fitted pipeline is saved under ``st.session_state[state_key]`` and its
    metrics are merged into ``st.session_state["model_results"][label]``
    without discarding results recorded for other models.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # The target is multiclass, so precision/recall need an averaging mode.
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(
        y_test, predictions, average="weighted", zero_division=1
    )
    recall = recall_score(y_test, predictions, average="weighted")

    st.session_state[state_key] = pipeline

    # Merge rather than overwrite, and create the dict on first use, so the
    # training order of the two models does not matter.
    results = {}
    if "model_results" in st.session_state:
        results = dict(st.session_state["model_results"])
    results[label] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
    }
    st.session_state["model_results"] = results

    st.success(
        f"{label} - Accuracy: {accuracy:.2f}, "
        f"Precision: {precision:.2f}, Recall: {recall:.2f}"
    )
os 0 → 100644
pd 0 → 100644
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment