import matplotlib.pyplot as plt import numpy as np from scipy import integrate from scipy import stats import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.linear_model import SGDRegressor from sklearn.preprocessing import StandardScaler columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"] columnsNoMaterial = ["Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"] units = ["", "kg/m^3", "m", "kg", "K", "Pa", "m", "s"] materials = ["magnesium", "polycarbonate", "silica", "zinc_oxide", "silicon_carbide", "titanium", "iron"] radii = [0.005, 0.01, 0.015, 0.02, 0.025] def getData(file): columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"] data = pd.read_csv(file, sep=',', names=columns, skiprows=9, on_bad_lines='skip') data["Density"] = pd.to_numeric(data["Density"], errors='coerce') data["Temperature"] = pd.to_numeric(data["Temperature"], errors='coerce') data["Time"] = pd.to_numeric(data["Time"], errors='coerce') for column in columns: if column == "Material": for i in data.index: if data[column][i] not in materials: data.drop(i, inplace=True) else: for i in data.index: if data[column][i] < 0: data.loc[i, column] = -data.loc[i, column] data.dropna(inplace=True) return data def columnStats(column, units): min = df[column].min() max = df[column].max() mean = df[column].mean() stdDev = df[column].std() print(f'Statistics for {column}') print(f'Minimum: {min}{units}') print(f'Maximum: {max}{units}') print(f'Mean: {mean}{units}') print(f'Standard Deviation: {stdDev}{units}') print() df = getData('exercise3data.csv') ####Part 1 def part1(): for i in range(len(columns)): if columns[i] == "Material": continue else: columnStats(columns[i], units[i]) for material in materials: materialDf = df[df["Material"] == material] for radius in radii: radiusDf = materialDf[materialDf["Radius"] == radius] plt.scatter(radiusDf["Height"], radiusDf["Time"], label=f'Radius {radius}m') plt.xlabel("Drop Height/m") plt.ylabel("Fall Time/s") plt.title(f'Material: {material}') plt.legend() plt.show() ####Part 2 def part2(): dfNoMaterial = df.drop("Material", axis=1) corrMatrix = dfNoMaterial.corr(method='pearson') fig, ax = plt.subplots() im = ax.imshow(corrMatrix, cmap="gnuplot", vmin=-1, vmax=1) ax.set_xticks(range(len(columnsNoMaterial)), labels=columnsNoMaterial) ax.set_yticks(range(len(columnsNoMaterial)), labels=columnsNoMaterial) for i in range(len(columnsNoMaterial)): for j in range(len(columnsNoMaterial)): text = ax.text(j, i, round(corrMatrix[columnsNoMaterial[i]][columnsNoMaterial[j]], 2), ha="center", va="center", color="w") fig.colorbar(im) fig.tight_layout() plt.show() ####Part 3 def part3(): features = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]] targets = df["Time"] linearReg = LinearRegression() linearFit = linearReg.fit(features, targets) for i in range(len(linearFit.feature_names_in_)): print(f'The coefficient of {linearFit.feature_names_in_[i]} is {linearFit.coef_[i]} {units[i+1]}') ironDf = df[df["Material"] == "iron"] def fitByMeans(density, radius, mass, temp, pressure, height): coefs = linearFit.coef_ time = linearFit.intercept_+(density*coefs[0])+(radius*coefs[1])+(mass*coefs[2])+(temp*coefs[3])+(pressure*coefs[4])+(height*coefs[5]) return time def predict(): for radius in radii: radiusDf = ironDf[ironDf["Radius"] == radius] plt.scatter(radiusDf["Height"], radiusDf["Time"],label="Experimental data") radiusFeatures = radiusDf[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]] plt.scatter(radiusDf["Height"], linearReg.predict(radiusFeatures),label="Predicted data") heightBounds = [radiusDf["Height"].min(),radiusDf["Height"].max()] linearByMeans = [fitByMeans(radiusDf["Density"].mean(),radiusDf["Radius"].mean(),radiusDf["Mass"].mean(),radiusDf["Temperature"].mean(),radiusDf["Pressure"].mean(),radiusDf["Height"].min()),fitByMeans(radiusDf["Density"].mean(),radiusDf["Radius"].mean(),radiusDf["Mass"].mean(),radiusDf["Temperature"].mean(),radiusDf["Pressure"].mean(),radiusDf["Height"].max())] plt.plot(heightBounds,linearByMeans,label="Fit Using Means") plt.xlabel("Drop Height/m") plt.ylabel("Fall Time/s") plt.legend() plt.title(f'Iron data and predictions for radius of {radius}m') plt.show() predict() trainData, testData = train_test_split(ironDf, test_size=0.1) features = trainData[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]] targets = trainData["Time"] trainLinearReg = LinearRegression() trainLinearFit = trainLinearReg.fit(features, targets) def trueVpred(): for radius in radii: radiusDf = testData[testData["Radius"] == radius] plt.scatter(radiusDf["Height"], radiusDf["Time"]) trueR2 = (stats.linregress(radiusDf["Height"], radiusDf["Time"]).rvalue)**2 radiusFeatures = radiusDf[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]] plt.scatter(radiusDf["Height"], trainLinearReg.predict(radiusFeatures),label="Predicted data") predR2 = (stats.linregress(radiusDf["Height"], trainLinearReg.predict(radiusFeatures)).rvalue)**2 print(f'For radius of {radius}m, the true R^2 value is {trueR2} and the predicted R^2 value is {predR2}') plt.show() trueVpred() def calcResiduals(): residualsFeatures = testData[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]] residuals = testData["Time"] - trainLinearReg.predict(residualsFeatures) plt.scatter(testData["Radius"], residuals) plt.show() calcResiduals() def part4(): reg = SGDRegressor() regSamples = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]] regTargets = df[["Time"]] reg.fit(regSamples, regTargets) print(f'The unscaled R^2 value is: {reg.score(regSamples, regTargets)}') scaler = StandardScaler() scaledSamples = scaler.fit_transform(regSamples, regTargets) scaledReg = SGDRegressor() scaledReg.fit(scaledSamples, regTargets) print(f'The scaled R^2 value is: {scaledReg.score(scaledSamples, regTargets)}') deScaledCoefs = scaledReg.coef_ / scaler.scale_ for i in range(6): print(f'The coefficient of {columns[i+1]} is {deScaledCoefs[i]} {units[i+1]}') huberReg = SGDRegressor(loss="huber") huberReg.fit(scaledSamples, regTargets) print(f'The scaled R^2 value using the huber loss function is: {huberReg.score(scaledSamples, regTargets)}') deScaledCoefs = huberReg.coef_ / scaler.scale_ for i in range(6): print(f'The coefficient of {columns[i+1]} using the huber loss function is {deScaledCoefs[i]} {units[i+1]}') part1() part2() part3() part4()