In [2]:
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import json
import math
from sklearn.model_selection import train_test_split

### 1.  Read Dataset and remove NAN values

In [3]:
diamonds = pd.read_csv('diamonds.csv')

diamonds.dropna(inplace=True)
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### 2.  Mapping non-numerical values to numerical ones

**Cut Quality**
| Cut        | Mapping |
|------------|------|
| Ideal      | 0    |
| Premium    | 1    |
| Good       | 2    |
| Very Good  | 3    |
| Fair       | 4    |

**Color Grade**
| Color | Mapping |
|-------|------|
| E     | 0    |
| I     | 1    |
| J     | 2    |
| H     | 3    |
| F     | 4    |
| G     | 5    |
| D     | 6    |

**Clarity Grade**
| Clarity | Mapping |
|---------|------|
| SI2     | 0    |
| SI1     | 1    |
| VS1     | 2    |
| VS2     | 3    |
| VVS2    | 4    |
| VVS1    | 5    |
| I1      | 6    |
| IF      | 7    |

In [4]:
cut_mapping = {cut: i for i, cut in enumerate(diamonds['cut'].unique())}
color_mapping = {color: i for i, color in enumerate(diamonds['color'].unique())}
clarity_mapping = {clarity: i for i, clarity in enumerate(diamonds['clarity'].unique())}

### 3.  Replace the non-numerical columns with their numerical mappings

In [5]:
diamonds_numerical = diamonds.copy()
diamonds_numerical['cut'] = diamonds['cut'].map(cut_mapping)
diamonds_numerical['color'] = diamonds['color'].map(color_mapping)
diamonds_numerical['clarity'] = diamonds['clarity'].map(clarity_mapping)

In [6]:
diamonds_numerical.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,0,0,0,61.5,55.0,326,3.95,3.98,2.43
1,0.21,1,0,1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,2,0,2,56.9,65.0,327,4.05,4.07,2.31
3,0.29,1,1,3,62.4,58.0,334,4.2,4.23,2.63
4,0.31,2,2,0,63.3,58.0,335,4.34,4.35,2.75


In [7]:
X = diamonds_numerical.drop('price', axis = 1)
y = diamonds_numerical['price']
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,0,0,0,61.5,55.0,3.95,3.98,2.43
1,0.21,1,0,1,59.8,61.0,3.89,3.84,2.31
2,0.23,2,0,2,56.9,65.0,4.05,4.07,2.31
3,0.29,1,1,3,62.4,58.0,4.2,4.23,2.63
4,0.31,2,2,0,63.3,58.0,4.34,4.35,2.75


In [8]:
y.head()

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Train the models
model.fit(X_train, y_train)

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 2029299.0804379764
R^2 Score: 0.8723456077895886


In [12]:
from xgboost import XGBRegressor
model2 = XGBRegressor(n_estimators=100,random_state=42)
model2.fit(X_train, y_train)

In [13]:
y_pred2 = model2.predict(X_test)
mse2 = mean_squared_error(y_test, y_pred2)
r22 = r2_score(y_test, y_pred2)
print("Mean Squared Error:", mse2)
print("R^2 Score:", r22)

Mean Squared Error: 314752.8223018785
R^2 Score: 0.9802002668730396


In [14]:
from sklearn.neural_network import MLPRegressor
model4 = MLPRegressor(random_state=42, max_iter=500)
model4.fit(X_train, y_train)



In [15]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred4 = model4.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mse2 = mean_squared_error(y_test, y_pred2)
mse4 = mean_squared_error(y_test, y_pred4)
r2 = r2_score(y_test, y_pred)
r22 = r2_score(y_test, y_pred2)
r24 = r2_score(y_test, y_pred4)
print("Mean Squared Error:", mse)
print("MSE 2 ", mse2)
print("MSE 4 ", mse4)
print("R^2 Score:", r2)
print("R^2 Score:", r22)
print("R^2 Score:", r24)

Mean Squared Error: 2029299.0804379764
MSE 2  314752.8223018785
MSE 4  1320105.4212450413
R^2 Score: 0.8723456077895886
R^2 Score: 0.9802002668730396
R^2 Score: 0.9169579009682819


In [16]:
test_diamond = [[
    1.00,
    2,
    1,
    1,
    60.4,
    55,
    4.20,
    4.3,
    2.27
]]
model2.predict(test_diamond)


array([3461.6711], dtype=float32)