Finish Exercise 3

2026-02-23 13:51:08 +00:00 · 2026-02-23 13:51:08 +00:00 · 129f37d139
commit 129f37d139
parent 4c8a1d0cd0
8 changed files with 250 additions and 0 deletions
--- a/3/exercise3.py
+++ b/3/exercise3.py
@ -8,12 +8,25 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import SGDRegressor
 from sklearn.preprocessing import StandardScaler

+"""
+The below section establishes some initial variables which will remain consistent throughout the program,
+such as the columns in the csv, the units for each, all the materials tested and the different radii tested
+"""
+
 columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
 columnsNoMaterial = ["Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
 units = ["", "kg/m^3", "m", "kg", "K", "Pa", "m", "s"]
 materials = ["magnesium", "polycarbonate", "silica", "zinc_oxide", "silicon_carbide", "titanium", "iron"]
 radii = [0.005, 0.01, 0.015, 0.02, 0.025]

+"""
+This function reads the csv file and imports it into a pandas dataframe, with the correct names for each column
+it then applies some corrections to the data, first making sure all the data that should be numeric is numeric
+which converts any non numerical data to NaN, which can be filtered out
+the function then deletes any lines which have a material not in the 'materials' list, and then converts any negative values to be positive
+finally, the function removes any rows containing NaN, then returns the cleaned dataframe
+"""
+
 def getData(file):
    columns = ["Material", "Density", "Radius", "Mass", "Temperature", "Pressure", "Height", "Time"]
    data = pd.read_csv(file, sep=',', names=columns, skiprows=9, on_bad_lines='skip')
@ -35,6 +48,11 @@ def getData(file):
    data.dropna(inplace=True)
    return data

+"""
+This function takes the column name and its units as input, then outputs statistics of the column, including min, max, mean and standard deviation,
+and then prints them with the relevant units.
+"""
+
 def columnStats(column, units):
    min = df[column].min()
    max = df[column].max()
@ -52,6 +70,14 @@ df = getData('exercise3data.csv')

 ####Part 1

+"""
+This function performs all the operations for part 1
+First, it iterates through the columns, skipping over material, and then uses the previous columnStats function to output the statistics for each column
+Then, it iterates through the list of materials, and for each one, filters the data frame to just the rows with that material.
+For each material, it then iterates through the list of radii, again filtering the dataframe to contain just rows with that radius, and then plots the remaining rows.
+Once every radius has been plotted, the plot is then shown, with the correct labels, title and legend.
+"""
+
 def part1():
    for i in range(len(columns)):
        if columns[i] == "Material":
@ -73,6 +99,14 @@ def part1():

 ####Part 2    

+"""
+This function performs all the operations for part 2.
+First, it removes the Material column from the dataframe, as this interferes with the subsequent operations
+It then uses the .corr() function to calculate the correlations between the various parameters, and assigns this to a variable
+A plot is then made of this matrix, with bounds set to -1 to 1, and then the function iterates through each tile on the plot and labels it with the relevant value
+Finally, a colour bar legend is added to the plot and the plot is given a title, and is then shown
+"""
+
 def part2():
    dfNoMaterial = df.drop("Material", axis=1)
    corrMatrix = dfNoMaterial.corr(method='pearson')
@ -90,10 +124,34 @@ def part2():

    fig.colorbar(im)
    fig.tight_layout()
+    fig.suptitle("Correlation between each parameter")
    plt.show()

 ####Part 3

+"""
+This function performs all the operations for section 3
+First, filtered dataframes are made, one with only the features affecting drop time, and one with just the drop time
+A linear regression of these values is then calculated using the sklearn function, and the coefficients are printed with their relevant units
+The dataframe is then filtered to contain only a single material, iron
+A function is then defined that takes values for density, radius, mass, temp, pressure and height as input, and then uses the coefficients calculated by the linear fit
+to calculate and return a value for fall time.
+A function to predict the fall times using the linear fit is then defined.
+It first filters the dataframe by radius, then plots the experimental, or 'true' data as a scatter plot
+The .predict function is then used, taking the dataframe of features as an input, to plot the predictions of each drop time based on fall distance.
+The fitByMeans function is then used, by passing the mean of each column and two values of drop height, and this data is used to plot a straight line of best fit
+The axes are given labels, and the plot is then shown.
+
+The data is then split randomly into 90% for training data and 10% for test data.
+Again, the LinearRegression function is used to calculate a linear regression, based this time on the random sample of test data.
+For each radius, the dataframe is first filtered, and the true values of fall time are plotted, and their R^2 value is calculated.
+The .predict function is again used to calculate the predicted values for fall time based off the training set, and this is plotted on the same graph, and its R^2 value is also calculated
+The R^2 values are then printed, and the plot is shown.
+
+Finally, a function to plot the residuals between the true and predicted data is defined.
+It finds the difference between each true value for time and its predicted one, and then plots these residuals against radius.
+"""
+
 def part3():
    features = df[["Density", "Radius", "Mass", "Temperature", "Pressure", "Height"]]
    targets = df["Time"]
@ -158,6 +216,15 @@ def part3():

    calcResiduals()

+"""
+This function performs all the operations for section 4
+First, the dataframe is again split up into the columns for the features affecting drop time and drop time.
+The SGD regressor function is then used to calculate an unscaled linear fit of the data, and its R^2 value is calculated and printed.
+The data is then scaled using the StandardScaler function, and the scaled features are saved to a variable.
+The linear regression is then calculated again, its R^2 value is calculated and printed, and the coefficients for each feature are listed along with their relevant units
+This process is then repeated again using the huber loss function rather than the least squares function.
+"""
+
 def part4():
    reg = SGDRegressor()