Uni-Computing/Exercise 3/exercise3.py
2026-02-23 13:51:08 +00:00

260 lines
12 KiB
Python

import matplotlib.pyplot as plt
import numpy as np
from scipy import integrate
from scipy import stats
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
"""
The below section establishes some initial variables which will remain consistent throughout the program,
such as the columns in the csv, the units for each, all the materials tested and the different radii tested
"""
# Column layout of the experiment CSV and the matching unit string for each
# column (empty string for the non-numeric Material column).
columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
columnsNoMaterial = ["Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
units = ["", "kg/m^3", "m", "kg", "K", "Pa", "m", "s"]
# Whitelist of valid material names (rows with any other material are dropped)
# and the set of ball radii that were tested, in metres.
materials = ["magnesium", "polycarbonate", "silica", "zinc_oxide", "silicon_carbide", "titanium", "iron"]
radii = [0.005, 0.01, 0.015, 0.02, 0.025]
"""
This function reads the csv file and imports it into a pandas dataframe, with the correct names for each column.
It then applies some corrections to the data, first making sure all the data that should be numeric is numeric,
which converts any non-numerical data to NaN, which can be filtered out.
The function then deletes any lines which have a material not in the 'materials' list, and then converts any negative values to be positive.
Finally, the function removes any rows containing NaN, then returns the cleaned dataframe.
"""
def getData(file, valid_materials=None):
    """Read the experiment CSV into a cleaned pandas DataFrame.

    Cleaning steps:
    - coerce every measurement column to numeric (bad entries become NaN),
    - keep only rows whose Material is in the accepted list,
    - flip any negative measurement to its absolute value,
    - drop every row that still contains NaN.

    Parameters
    ----------
    file : str or file-like
        Path (or buffer) of the CSV; the first 9 lines are metadata and skipped.
    valid_materials : list of str, optional
        Accepted material names; defaults to the module-level ``materials`` list.

    Returns
    -------
    pandas.DataFrame
        The cleaned data with the standard column names.
    """
    cols = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
    data = pd.read_csv(file, sep=',', names=cols, skiprows=9, on_bad_lines='skip')
    numeric_cols = cols[1:]
    # Coerce *all* measurement columns, not just a subset: a stray string in
    # any of them would otherwise break the sign check below. Bad values
    # become NaN and are removed by dropna at the end.
    for column in numeric_cols:
        data[column] = pd.to_numeric(data[column], errors='coerce')
    accepted = materials if valid_materials is None else valid_materials
    # Keep only rows with a recognised material name (vectorized, no row loop).
    data = data[data["Material"].isin(accepted)].copy()
    # Measurements are magnitudes, so sign errors in the file are flipped positive.
    data[numeric_cols] = data[numeric_cols].abs()
    data.dropna(inplace=True)
    return data
"""
This function takes the column name and its units as input, then outputs statistics of the column, including min, max, mean and standard deviation,
and then prints them with the relevant units.
"""
def columnStats(column, units, data=None):
    """Print and return min, max, mean and standard deviation of a column.

    Parameters
    ----------
    column : str
        Name of the dataframe column to summarise.
    units : str
        Unit string appended to each printed value.
    data : pandas.DataFrame, optional
        Frame to summarise; defaults to the module-level cleaned dataframe ``df``.

    Returns
    -------
    tuple
        (minimum, maximum, mean, standard deviation) of the column.
    """
    frame = df if data is None else data
    # Renamed locals so the builtins min/max are not shadowed.
    col_min = frame[column].min()
    col_max = frame[column].max()
    col_mean = frame[column].mean()
    col_std = frame[column].std()
    print(f'Statistics for {column}')
    print(f'Minimum: {col_min}{units}')
    print(f'Maximum: {col_max}{units}')
    print(f'Mean: {col_mean}{units}')
    print(f'Standard Deviation: {col_std}{units}')
    print()
    return col_min, col_max, col_mean, col_std
# Load and clean the experiment data once at module level; used by all parts below.
df = getData('exercise3data.csv')
####Part 1
"""
This function performs all the operations for part 1.
First, it iterates through the columns, skipping over Material, and then uses the previous columnStats function to output the statistics for each column.
Then, it iterates through the list of materials, and for each one, filters the data frame to just the rows with that material.
For each material, it then iterates through the list of radii, again filtering the dataframe to contain just rows with that radius, and then plots the remaining rows.
Once every radius has been plotted, the plot is then shown, with the correct labels, title and legend.
"""
def part1():
    """Part 1: per-column statistics and height-vs-time scatter plots.

    Prints summary statistics for every numeric column, then for each
    material shows one figure of fall time against drop height with one
    scatter series per ball radius.
    """
    # zip pairs each column with its unit; Material is non-numeric, so skip it.
    for name, unit in zip(columns, units):
        if name != "Material":
            columnStats(name, unit)
    for material in materials:
        materialDf = df[df["Material"] == material]
        for radius in radii:
            radiusDf = materialDf[materialDf["Radius"] == radius]
            plt.scatter(radiusDf["Height"], radiusDf["Time"], label=f'Radius {radius}m')
        plt.xlabel("Drop Height/m")
        plt.ylabel("Fall Time/s")
        plt.title(f'Material: {material}')
        plt.legend()
        plt.show()
####Part 2
"""
This function performs all the operations for part 2.
First, it removes the Material column from the dataframe, as this interferes with the subsequent operations.
It then uses the .corr() function to calculate the correlations between the various parameters, and assigns this to a variable.
A plot is then made of this matrix, with bounds set to -1 to 1, and then the function iterates through each tile on the plot and labels it with the relevant value.
Finally, a colour bar legend is added to the plot and the plot is given a title, and is then shown.
"""
def part2():
    """Part 2: heat-map of the Pearson correlations between numeric columns.

    Drops the non-numeric Material column, computes the correlation matrix,
    renders it as an image bounded to [-1, 1], annotates every cell with its
    rounded coefficient, and shows the figure with a colour bar.
    """
    numeric = df.drop("Material", axis=1)
    corr = numeric.corr(method='pearson')
    fig, ax = plt.subplots()
    image = ax.imshow(corr, cmap="gnuplot", vmin=-1, vmax=1)
    ax.set_xticks(range(len(columnsNoMaterial)), labels=columnsNoMaterial)
    ax.set_yticks(range(len(columnsNoMaterial)), labels=columnsNoMaterial)
    # Label each tile (x = column position, y = row position) with its value.
    for row, row_name in enumerate(columnsNoMaterial):
        for col, col_name in enumerate(columnsNoMaterial):
            ax.text(col, row, round(corr[row_name][col_name], 2),
                    ha="center", va="center", color="w")
    fig.colorbar(image)
    fig.tight_layout()
    fig.suptitle("Correlation between each parameter")
    plt.show()
####Part 3
"""
This function performs all the operations for section 3.
First, filtered dataframes are made, one with only the features affecting drop time, and one with just the drop time.
A linear regression of these values is then calculated using the sklearn function, and the coefficients are printed with their relevant units.
The dataframe is then filtered to contain only a single material, iron.
A function is then defined that takes values for density, radius, mass, temperature, pressure and height as input, and then uses the coefficients calculated by the linear fit
to calculate and return a value for fall time.
A function to predict the fall times using the linear fit is then defined.
It first filters the dataframe by radius, then plots the experimental, or 'true', data as a scatter plot.
The .predict function is then used, taking the dataframe of features as an input, to plot the predictions of each drop time based on fall distance.
The fitByMeans function is then used, by passing the mean of each column and two values of drop height, and this data is used to plot a straight line of best fit.
The axes are given labels, and the plot is then shown.
The data is then split randomly into 90% for training data and 10% for test data.
Again, the LinearRegression function is used to calculate a linear regression, based this time on the random sample of training data.
For each radius, the dataframe is first filtered, and the true values of fall time are plotted, and their R^2 value is calculated.
The .predict function is again used to calculate the predicted values for fall time based off the training set, and this is plotted on the same graph, and its R^2 value is also calculated.
The R^2 values are then printed, and the plot is shown.
Finally, a function to plot the residuals between the true and predicted data is defined.
It finds the difference between each true value for time and its predicted one, and then plots these residuals against radius.
"""
def part3():
    """Part 3: multivariate linear regression of fall time on the features.

    Fits a LinearRegression on the whole dataset and prints the coefficients,
    then for iron compares experimental data with model predictions, plots a
    mean-based straight-line fit, refits on a 90/10 train/test split with R^2
    values, and finally plots prediction residuals against radius.
    """
    # Features are every column except Material and the target (Time).
    features = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
    targets = df["Time"]
    linearReg = LinearRegression()
    linearFit = linearReg.fit(features, targets)
    # NOTE(review): units[i+1] is the unit of the feature itself, not of the
    # coefficient (which would be seconds per feature-unit) — confirm intended.
    for i in range(len(linearFit.feature_names_in_)):
        print(f'The coefficient of {linearFit.feature_names_in_[i]} is {linearFit.coef_[i]} {units[i+1]}')
    ironDf = df[df["Material"] == "iron"]
    def fitByMeans(density, radius, mass, temp, pressure, height):
        # Evaluate the fitted model by hand: intercept plus coefficient-weighted inputs.
        coefs = linearFit.coef_
        time = linearFit.intercept_+(density*coefs[0])+(radius*coefs[1])+(mass*coefs[2])+(temp*coefs[3])+(pressure*coefs[4])+(height*coefs[5])
        return time
    def predict():
        # One figure per radius: experimental points, model predictions, and a
        # straight line through fitByMeans at the minimum and maximum heights.
        for radius in radii:
            radiusDf = ironDf[ironDf["Radius"] == radius]
            plt.scatter(radiusDf["Height"], radiusDf["Time"],label="Experimental data")
            radiusFeatures = radiusDf[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
            plt.scatter(radiusDf["Height"], linearReg.predict(radiusFeatures),label="Predicted data")
            heightBounds = [radiusDf["Height"].min(),radiusDf["Height"].max()]
            linearByMeans = [fitByMeans(radiusDf["Density"].mean(),radiusDf["Radius"].mean(),radiusDf["Mass"].mean(),radiusDf["Temperature"].mean(),radiusDf["Pressure"].mean(),radiusDf["Height"].min()),fitByMeans(radiusDf["Density"].mean(),radiusDf["Radius"].mean(),radiusDf["Mass"].mean(),radiusDf["Temperature"].mean(),radiusDf["Pressure"].mean(),radiusDf["Height"].max())]
            plt.plot(heightBounds,linearByMeans,label="Fit Using Means")
            plt.xlabel("Drop Height/m")
            plt.ylabel("Fall Time/s")
            plt.legend()
            plt.title(f'Iron data and predictions for radius of {radius}m')
            plt.show()
    predict()
    # Refit on a random 90% training split; evaluate on the held-out 10%.
    trainData, testData = train_test_split(ironDf, test_size=0.1)
    features = trainData[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
    targets = trainData["Time"]
    trainLinearReg = LinearRegression()
    trainLinearFit = trainLinearReg.fit(features, targets)
    def trueVpred():
        # For each radius: plot true and predicted times and report both R^2
        # values, each from a height-vs-time straight-line fit via linregress.
        # NOTE(review): assumes every radius appears at least twice in the 10%
        # test split — linregress needs two or more points; verify with data size.
        for radius in radii:
            radiusDf = testData[testData["Radius"] == radius]
            plt.scatter(radiusDf["Height"], radiusDf["Time"])
            trueR2 = (stats.linregress(radiusDf["Height"], radiusDf["Time"]).rvalue)**2
            radiusFeatures = radiusDf[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
            plt.scatter(radiusDf["Height"], trainLinearReg.predict(radiusFeatures),label="Predicted data")
            predR2 = (stats.linregress(radiusDf["Height"], trainLinearReg.predict(radiusFeatures)).rvalue)**2
            print(f'For radius of {radius}m, the true R^2 value is {trueR2} and the predicted R^2 value is {predR2}')
        plt.show()
    trueVpred()
    def calcResiduals():
        # Residual = measured time minus predicted time, plotted against radius.
        residualsFeatures = testData[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
        residuals = testData["Time"] - trainLinearReg.predict(residualsFeatures)
        plt.scatter(testData["Radius"], residuals)
        plt.show()
    calcResiduals()
"""
This function performs all the operations for section 4.
First, the dataframe is again split up into the columns for the features affecting drop time and drop time.
The SGD regressor function is then used to calculate an unscaled linear fit of the data, and its R^2 value is calculated and printed.
The data is then scaled using the StandardScaler function, and the scaled features are saved to a variable.
The linear regression is then calculated again, its R^2 value is calculated and printed, and the coefficients for each feature are listed along with their relevant units.
This process is then repeated again using the Huber loss function rather than the least squares function.
"""
def part4():
    """Part 4: SGD regression of fall time, unscaled vs scaled features.

    Fits SGDRegressor on raw features, then on standardised features, and
    finally with the Huber loss; prints each R^2 value and the de-scaled
    coefficients in physical units.
    """
    reg = SGDRegressor()
    regSamples = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
    # Use a 1-D Series as the target: sklearn expects y of shape (n,), and a
    # single-column DataFrame triggers a DataConversionWarning.
    regTargets = df["Time"]
    reg.fit(regSamples, regTargets)
    print(f'The unscaled R^2 value is: {reg.score(regSamples, regTargets)}')
    scaler = StandardScaler()
    # StandardScaler ignores y, so only the samples are passed.
    scaledSamples = scaler.fit_transform(regSamples)
    scaledReg = SGDRegressor()
    scaledReg.fit(scaledSamples, regTargets)
    print(f'The scaled R^2 value is: {scaledReg.score(scaledSamples, regTargets)}')
    # Undo the standardisation so each coefficient is in its physical units.
    deScaledCoefs = scaledReg.coef_ / scaler.scale_
    for i in range(6):
        print(f'The coefficient of {columns[i+1]} is {deScaledCoefs[i]} {units[i+1]}')
    # Repeat with the Huber loss, which is robust to outliers.
    huberReg = SGDRegressor(loss="huber")
    huberReg.fit(scaledSamples, regTargets)
    print(f'The scaled R^2 value using the huber loss function is: {huberReg.score(scaledSamples, regTargets)}')
    deScaledCoefs = huberReg.coef_ / scaler.scale_
    for i in range(6):
        print(f'The coefficient of {columns[i+1]} using the huber loss function is {deScaledCoefs[i]} {units[i+1]}')
# Run the full analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    part1()
    part2()
    part3()
    part4()