260 lines
12 KiB
Python
260 lines
12 KiB
Python
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from scipy import integrate
|
|
from scipy import stats
|
|
import pandas as pd
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.linear_model import SGDRegressor
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
"""
|
|
The below section establishes some initial variables which will remain consistent throughout the program,
|
|
such as the columns in the csv, the units for each, all the materials tested and the different radii tested
|
|
"""
|
|
|
|
columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
|
|
columnsNoMaterial = ["Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
|
|
units = ["", "kg/m^3", "m", "kg", "K", "Pa", "m", "s"]
|
|
materials = ["magnesium", "polycarbonate", "silica", "zinc_oxide", "silicon_carbide", "titanium", "iron"]
|
|
radii = [0.005, 0.01, 0.015, 0.02, 0.025]
|
|
|
|
"""
|
|
This function reads the csv file and imports is into a pandas dataframe, with the correct names for each column
|
|
it then applies some corrections to the data, first making sure all the data that should be numeric is numeric
|
|
which converts any non numerical data to NaN, which can be filtered out
|
|
the function then deletes any lines which have a material not in the 'materials' list, and then converts any negative values to be positive
|
|
finally, the function removes any rows containing NaN, then returns the cleaned dataframe
|
|
"""
|
|
|
|
def getData(file):
|
|
columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
|
|
data = pd.read_csv(file, sep=',', names=columns, skiprows=9, on_bad_lines='skip')
|
|
|
|
data["Density"] = pd.to_numeric(data["Density"], errors='coerce')
|
|
data["Temperature"] = pd.to_numeric(data["Temperature"], errors='coerce')
|
|
data["Time"] = pd.to_numeric(data["Time"], errors='coerce')
|
|
|
|
for column in columns:
|
|
if column == "Material":
|
|
for i in data.index:
|
|
if data[column][i] not in materials:
|
|
data.drop(i, inplace=True)
|
|
else:
|
|
for i in data.index:
|
|
if data[column][i] < 0:
|
|
data.loc[i, column] = -data.loc[i, column]
|
|
|
|
data.dropna(inplace=True)
|
|
return data
|
|
|
|
"""
|
|
This function takes the column name aand its units as input, then outputs statistics of the column, including min, max, mean ans standard deviation,
|
|
and then prints them with the relvant units.
|
|
"""
|
|
|
|
def columnStats(column, units):
|
|
min = df[column].min()
|
|
max = df[column].max()
|
|
mean = df[column].mean()
|
|
stdDev = df[column].std()
|
|
|
|
print(f'Statistics for {column}')
|
|
print(f'Minimum: {min}{units}')
|
|
print(f'Maximum: {max}{units}')
|
|
print(f'Mean: {mean}{units}')
|
|
print(f'Standard Deviation: {stdDev}{units}')
|
|
print()
|
|
|
|
df = getData('exercise3data.csv')
|
|
|
|
####Part 1
|
|
|
|
"""
|
|
This function performs all the operations for part 1
|
|
First, it iterates through the columns, skkipping over material, and then uses the previous columnStats function to output the statistics for each column
|
|
Then, it iterates through the list of materials, and for each one, filters the data frame to just the rows with that material.
|
|
For each material, it then iterates through the list of radii, again filtering the dataframe to contain just rows with that radius, and then plots the remaining rows.
|
|
Once every radius has been plotted, the plot is then shown, with the correct labels, title and legend.
|
|
"""
|
|
|
|
def part1():
|
|
for i in range(len(columns)):
|
|
if columns[i] == "Material":
|
|
continue
|
|
else:
|
|
columnStats(columns[i], units[i])
|
|
|
|
for material in materials:
|
|
materialDf = df[df["Material"] == material]
|
|
for radius in radii:
|
|
radiusDf = materialDf[materialDf["Radius"] == radius]
|
|
plt.scatter(radiusDf["Height"], radiusDf["Time"], label=f'Radius {radius}m')
|
|
|
|
plt.xlabel("Drop Height/m")
|
|
plt.ylabel("Fall Time/s")
|
|
plt.title(f'Material: {material}')
|
|
plt.legend()
|
|
plt.show()
|
|
|
|
####Part 2
|
|
|
|
"""
|
|
This function performs all the operations for part 2.
|
|
First, it removes the Material column from the dataframe, as this interferes with the subsequent operations
|
|
It then uses the .corr() function to calculate thye correlations between the various parameters, and assigns this to a variable
|
|
A plot is then made of this matrix, with bounds set to -1 to 1, and then the function iterates through earch tile on the plot and labels it with the relevant value
|
|
Finally, a colour bar legend is added to the plot and the plot is given a title, and is then shown
|
|
"""
|
|
|
|
def part2():
|
|
dfNoMaterial = df.drop("Material", axis=1)
|
|
corrMatrix = dfNoMaterial.corr(method='pearson')
|
|
|
|
fig, ax = plt.subplots()
|
|
im = ax.imshow(corrMatrix, cmap="gnuplot", vmin=-1, vmax=1)
|
|
|
|
ax.set_xticks(range(len(columnsNoMaterial)), labels=columnsNoMaterial)
|
|
ax.set_yticks(range(len(columnsNoMaterial)), labels=columnsNoMaterial)
|
|
|
|
for i in range(len(columnsNoMaterial)):
|
|
for j in range(len(columnsNoMaterial)):
|
|
text = ax.text(j, i, round(corrMatrix[columnsNoMaterial[i]][columnsNoMaterial[j]], 2),
|
|
ha="center", va="center", color="w")
|
|
|
|
fig.colorbar(im)
|
|
fig.tight_layout()
|
|
fig.suptitle("Correlation between each parameter")
|
|
plt.show()
|
|
|
|
####Part 3
|
|
|
|
"""
|
|
This function performs all the operations for section 3
|
|
First, filtered dataframes are made, one with only the features affecting drop time, and one with just the drop time
|
|
A linear regression of these values is then calculated using the sklearn function, and the coefficients are printed with their relevant units
|
|
The dataframe is then filtered to contain only a single material, iron
|
|
A function is then defined that takes values for density, radius, mass, temp, pressure and height as input, and then uses the coefficiients calculated by the linear fit
|
|
to calculate and return a value for fall time.
|
|
A function to predict the fall times using the linear fit is then defined.
|
|
It first filters the dataframe by radius, then plots the ex,perimental, or 'true' data as a scatter plot
|
|
The .predict function is then used, taking the dataframe of features as an input, to plot the predictions of each drop time based on fall distance.
|
|
The fitByMeans function is then used, by passing the mean of each column and two values of drop height, and this data is used to plot a straigfht line of best fit
|
|
The axis are given labels, and the plot is then shown.
|
|
|
|
The data is then split randomly into 90% fo training data and 10% for test data.
|
|
Again, the LinearRegression function is used to calculate a linear regression based thi time of the random smple of test data.
|
|
For each radius, the dataframe is first filtered, and the true values of fall time are plotted, and their R^2 value is calculated.
|
|
The .predict function is again used to calculate the predicted values for fall time based off the training set, and this is plotted on the same graph, and its R^2 value is also calculated
|
|
The R^2 values are then printed, and the plot is shown.
|
|
|
|
Finaly, a function to plot the residuals between the true and predicted data is defined.
|
|
It finds the differnce between each true value for time and its predicted one, and then plots these residuals against radius.
|
|
"""
|
|
|
|
def part3():
|
|
features = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
|
|
targets = df["Time"]
|
|
|
|
linearReg = LinearRegression()
|
|
linearFit = linearReg.fit(features, targets)
|
|
|
|
for i in range(len(linearFit.feature_names_in_)):
|
|
print(f'The coefficient of {linearFit.feature_names_in_[i]} is {linearFit.coef_[i]} {units[i+1]}')
|
|
|
|
ironDf = df[df["Material"] == "iron"]
|
|
|
|
def fitByMeans(density, radius, mass, temp, pressure, height):
|
|
coefs = linearFit.coef_
|
|
time = linearFit.intercept_+(density*coefs[0])+(radius*coefs[1])+(mass*coefs[2])+(temp*coefs[3])+(pressure*coefs[4])+(height*coefs[5])
|
|
return time
|
|
|
|
def predict():
|
|
for radius in radii:
|
|
radiusDf = ironDf[ironDf["Radius"] == radius]
|
|
plt.scatter(radiusDf["Height"], radiusDf["Time"],label="Experimental data")
|
|
radiusFeatures = radiusDf[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
|
|
plt.scatter(radiusDf["Height"], linearReg.predict(radiusFeatures),label="Predicted data")
|
|
heightBounds = [radiusDf["Height"].min(),radiusDf["Height"].max()]
|
|
linearByMeans = [fitByMeans(radiusDf["Density"].mean(),radiusDf["Radius"].mean(),radiusDf["Mass"].mean(),radiusDf["Temperature"].mean(),radiusDf["Pressure"].mean(),radiusDf["Height"].min()),fitByMeans(radiusDf["Density"].mean(),radiusDf["Radius"].mean(),radiusDf["Mass"].mean(),radiusDf["Temperature"].mean(),radiusDf["Pressure"].mean(),radiusDf["Height"].max())]
|
|
plt.plot(heightBounds,linearByMeans,label="Fit Using Means")
|
|
plt.xlabel("Drop Height/m")
|
|
plt.ylabel("Fall Time/s")
|
|
plt.legend()
|
|
plt.title(f'Iron data and predictions for radius of {radius}m')
|
|
plt.show()
|
|
|
|
predict()
|
|
|
|
trainData, testData = train_test_split(ironDf, test_size=0.1)
|
|
|
|
features = trainData[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
|
|
targets = trainData["Time"]
|
|
|
|
trainLinearReg = LinearRegression()
|
|
trainLinearFit = trainLinearReg.fit(features, targets)
|
|
|
|
def trueVpred():
|
|
for radius in radii:
|
|
radiusDf = testData[testData["Radius"] == radius]
|
|
plt.scatter(radiusDf["Height"], radiusDf["Time"])
|
|
trueR2 = (stats.linregress(radiusDf["Height"], radiusDf["Time"]).rvalue)**2
|
|
radiusFeatures = radiusDf[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
|
|
plt.scatter(radiusDf["Height"], trainLinearReg.predict(radiusFeatures),label="Predicted data")
|
|
predR2 = (stats.linregress(radiusDf["Height"], trainLinearReg.predict(radiusFeatures)).rvalue)**2
|
|
print(f'For radius of {radius}m, the true R^2 value is {trueR2} and the predicted R^2 value is {predR2}')
|
|
plt.show()
|
|
|
|
trueVpred()
|
|
|
|
def calcResiduals():
|
|
residualsFeatures = testData[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
|
|
residuals = testData["Time"] - trainLinearReg.predict(residualsFeatures)
|
|
|
|
plt.scatter(testData["Radius"], residuals)
|
|
plt.show()
|
|
|
|
calcResiduals()
|
|
|
|
"""
|
|
This function performs all the operations for section 4
|
|
First, the dataframe is again split up into the columns for the features affecting drop time and drop time.
|
|
The SDG regressor function is then used to calculate an unscaled linear fit of the data, and its R^2 value is calculated and printed.
|
|
The data is then scaled using the StandardScaler function, and the scaled features are saved to a variable.
|
|
The linear regression is then calculated again, its R^2 value is calculated and prnted, and the coefficients for each fearture are listed along with their relevant units
|
|
This process is then repeated again using the huber loss function rather than the least squares function.
|
|
"""
|
|
|
|
def part4():
|
|
reg = SGDRegressor()
|
|
|
|
regSamples = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
|
|
regTargets = df[["Time"]]
|
|
reg.fit(regSamples, regTargets)
|
|
print(f'The unscaled R^2 value is: {reg.score(regSamples, regTargets)}')
|
|
|
|
scaler = StandardScaler()
|
|
scaledSamples = scaler.fit_transform(regSamples, regTargets)
|
|
|
|
scaledReg = SGDRegressor()
|
|
|
|
scaledReg.fit(scaledSamples, regTargets)
|
|
print(f'The scaled R^2 value is: {scaledReg.score(scaledSamples, regTargets)}')
|
|
deScaledCoefs = scaledReg.coef_ / scaler.scale_
|
|
|
|
for i in range(6):
|
|
print(f'The coefficient of {columns[i+1]} is {deScaledCoefs[i]} {units[i+1]}')
|
|
|
|
huberReg = SGDRegressor(loss="huber")
|
|
|
|
huberReg.fit(scaledSamples, regTargets)
|
|
print(f'The scaled R^2 value using the huber loss function is: {huberReg.score(scaledSamples, regTargets)}')
|
|
deScaledCoefs = huberReg.coef_ / scaler.scale_
|
|
|
|
for i in range(6):
|
|
print(f'The coefficient of {columns[i+1]} using the huber loss function is {deScaledCoefs[i]} {units[i+1]}')
|
|
|
|
part1()
|
|
part2()
|
|
part3()
|
|
part4()
|