main
Spagnolo Gasper 2022-12-27 11:34:53 +01:00
parent bd33f92903
commit cd45a6bcb8
2 changed files with 273 additions and 80 deletions

View File

@ -7,23 +7,44 @@ import uz_framework.text as uz_text
import os
def one_a() -> None:
print('Hello')
"""
Solve the following assignment by hand for practice: You are given four points
A(3, 4), B(3, 6), C(7, 6) and D(6, 4). Calculate the eigenvectors and eigenvalues for
the given set of points.
"""
print('Solved in notes')
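# A minimal sketch to verify the hand calculation numerically (assumes only
# numpy, imported above as np; not a substitute for the hand derivation).
pts = np.array([[3, 4], [3, 6], [7, 6], [6, 4]], dtype=np.float64)
cov = np.cov(pts.T) # 2x2 covariance matrix of the four points
eigvals, eigvecs = np.linalg.eigh(cov) # eigen-decomposition of a symmetric matrix
print('eigenvalues:', eigvals)
print('eigenvectors (columns):', eigvecs)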
def one_b() -> None:
"""
Write a script to calculate and visualize PCA from 2D data from the file points.txt
(the first column contains the x axis and the second column the y axis). Plot the
points and draw the representation of the Gaussian distribution using drawEllipse
from the supplementary material. Follow Algorithm 1 to compute the eigenvectors and eigenvalues of the PCA subspace.
"""
points = np.loadtxt('./data/points.txt')
uz_image.compute_PCA(points)
uz_image.compute_PCA(points, plot=True)
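# For reference, a rough sketch of the Algorithm 1 steps that compute_PCA is
# assumed to perform (mirroring uz_image.compute_PCA in the second changed file):
# X = points.T - np.mean(points.T, axis=1, keepdims=True) # center the data
# C = (1 / (X.shape[1] - 1)) * (X @ X.T) # covariance matrix
# U, S, VT = np.linalg.svd(C) # eigenvectors/eigenvalues of the PCA subspace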
def one_c() -> None:
"""
The matrix U contains the eigenvectors that represent the basis of our PCA subspace. Draw the eigenvectors on the plot from the previous task. Their origin should
lie at the mean of the data µ. Since both vectors have length 1, a better way
of visualizing them is to multiply each vector by the corresponding eigenvalue.
Draw the first eigenvector with red and the second with green.
"""
points = np.loadtxt('./data/points.txt')
uz_image.compute_PCA(points, plot=True)
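# The eigenvector drawing for this task happens inside uz_image.compute_PCA when
# plot=True; roughly (with U, S, mean as returned by compute_PCA), each direction
# is drawn from the mean with length equal to the square root of its eigenvalue:
# plt.plot([mean[0], mean[0] + U[0,0] * np.sqrt(S[0])], [mean[1], mean[1] + U[0,1] * np.sqrt(S[0])], color='red')
# plt.plot([mean[0], mean[0] + U[1,0] * np.sqrt(S[1])], [mean[1], mean[1] + U[1,1] * np.sqrt(S[1])], color='green')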
def one_d() -> None:
points = np.loadtxt('./data/points.txt')
U, S, VT = uz_image.compute_PCA(points)
U, S, VT, _ = uz_image.compute_PCA(points)
uz_image.plot_histogram_pca(U, S, VT)
# U is the matrix of eigenvectors
# S is the vector of eigenvalues
# VT is the transpose of U (for the symmetric covariance matrix the left and right singular vectors coincide)
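# A small sketch of the quantity plot_histogram_pca visualizes (assuming S is
# sorted in descending order, as np.linalg.svd returns it): the fraction of the
# total variance carried by each eigenvalue.
explained = S / np.sum(S)
print('variance explained by the first eigenvector:', explained[0])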
def one_e() -> None:
"""
@ -37,7 +58,7 @@ def one_e() -> None:
"""
points = np.loadtxt('./data/points.txt')
U, S, VT, mean = uz_image.compute_PCA(points)
U, _, _, mean = uz_image.compute_PCA(points)
# Drop all but first eigenvector
U = U[:, 0:1]
@ -50,53 +71,128 @@ def one_e() -> None:
# Plot the original points and the projected points
plt.scatter(points[:, 0], points[:, 1], c='b', label='Original')
plt.scatter(x_i[:, 0], x_i[:, 1], c='r', label='Projected')
plt.scatter(x_i[:, 0], x_i[:, 1], c='r', label='Projected onto the subspace of the first eigenvector')
plt.legend()
plt.show()
def two_a():
def one_f() -> None:
"""
For the point qpoint = [6, 6]^T calculate the closest point from the input data (using
Euclidean distance). Which point is the closest? Then, project all the points (including
qpoint) to the PCA subspace (calculated without qpoint) and remove the variation
in the direction of the second vector. Calculate the distances again. Which point is
the closest to qpoint now? Visualize the reconstruction.
"""
points = np.loadtxt('./data/points.txt')
point = np.array([6,6])
# Find the closest point using Euclidean distance
d = np.linalg.norm(points - point, axis=1)
closest_point = points[np.argmin(d)]
# Plot the original points
plt.scatter(points[:,0], points[:,1], c='b', label='Original')
plt.scatter(point[0], point[1], c='g', label=f'Point: {point}')
plt.scatter(closest_point[0], closest_point[1], c='r', label=f'Closest point: {closest_point}')
U, _, _, mean = uz_image.compute_PCA(points)
# Drop all but first eigenvector
U = U[:, 0:1]
# Reproject points
y_i = np.matmul(points - mean, U)
# Reproject back
x_i = np.matmul(y_i, U.T) + mean
# Reproject point
point_repr = np.matmul(point - mean, U)
point_repr = np.matmul(point_repr, U.T) + mean
# Find the closest point using Euclidean distance
d = np.linalg.norm(x_i - point_repr, axis=1)
closest_point = x_i[np.argmin(d)]
# Plot the reprojected points
plt.scatter(x_i[:,0], x_i[:,1], c='purple', label='Reprojected')
plt.scatter(point_repr[0], point_repr[1], c='y', label=f'repr Point: {point_repr}')
plt.scatter(closest_point[0], closest_point[1], c='orange', label=f'Closest point repr: {closest_point}')
plt.legend()
plt.show()
def two_a() -> None:
"""
For our requirements it is necessary only to correctly calculate eigenvectors and
eigenvalues up to a scale factor. Therefore implement the dual method according
to Algorithm 2 and test it using the data from points.txt. The first two
eigenvectors should be the same as with Algorithm 1. Algorithm 2 gives
you a larger matrix U, however, all eigenvectors but the first two are equal to zero.
"""
_ = np.loadtxt('./data/points.txt')
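# A hedged sketch of the check described above: compare the first two eigenvectors
# of both algorithms (signs may differ, since eigenvectors are only defined up to scale).
points = np.loadtxt('./data/points.txt')
U_direct, _, _, _ = uz_image.compute_PCA(points)
U_dual, _, _, _ = uz_image.dual_PCA(points)
print('Algorithm 1:', U_direct[:, :2])
print('Algorithm 2:', U_dual[:, :2])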
def two_b():
"""
Project the data from the previous assignment to the PCA space using matrix U,
and then project the data back again in the original space. If you have implemented
the method correctly, you will get the original data (up to the numerical error).
"""
points = np.loadtxt('./data/points.txt')
U, S, VT, mean = dual_PCA(points)
U, _, _, mean = uz_image.dual_PCA(points)
print(U)
y_i = np.matmul(points - mean, U)
# Project points back into original subspace
x_i = np.matmul(y_i, U.T) + mean
# Plot the original points and the projected points
plt.scatter(points[:, 0], points[:, 1], c='b', label='Original')
plt.scatter(x_i[:, 0], x_i[:, 1], c='r', label='Projected')
fig, axs = plt.subplots(1,2)
fig.suptitle('DUAL PCA')
axs[0].scatter(points[:, 0], points[:, 1], c='b', label='Original')
axs[1].scatter(x_i[:, 0], x_i[:, 1], c='r', label='Projected')
fig.legend()
plt.show()
def dual_PCA(points):
X = points.T.copy()
mean = np.mean(X, axis=1)
X = X - mean[:, np.newaxis]
# Compute the covariance matrix
C = (1/(X.shape[1]-1)) * X.T @ X
# Compute the eigenvalues and eigenvectors
U, S, VT = np.linalg.svd(C)
S += 1e-15
U = X @ U @ np.sqrt(np.diag(1/(S * (X.shape[1]-1))))
return U, S, VT, mean
def three_a():
imgs = read_images('./data/faces/1')
"""
Data preparation: Firstly, we have to formulate the problem in a way that is
compatible with the PCA method. Since PCA operates on points in space, we can
represent a grayscale image of size m × n as a point in mn-dimensional space if we
reshape it into a vector. Write a function that reads all the images from one of the
series, transforms them into grayscale, reshapes them using np.reshape and stacks
the resulting column vectors into a matrix of size mn × 64.
"""
_ = uz_image.read_images('./data/faces/1')
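# Sanity-check sketch for the shape described above (assuming the 96x84 faces
# used elsewhere in this script): read_images returns the transposed stack,
# i.e. one image per row.
imgs = uz_image.read_images('./data/faces/1')
print(imgs.shape) # expected (64, 96 * 84)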
def three_b():
imgs = read_images('./data/faces/1')
U, S, VT, mean = dual_PCA(imgs)
"""
Using dual PCA: Use dual PCA on the vectors of images. Write a function that
takes the matrix of image vectors as input and returns the eigenvectors of the PCA
subspace and the mean of the input data.
Note: In step 5 of Algorithm 2 be careful when computing the inverse of S, as
some of the eigenvalues can be very close to 0. Division by zero can cause numerical
errors when computing a matrix inverse. You have to take into account that the
matrix S is a diagonal matrix and must therefore have non-zero diagonal elements.
One way of solving this numerical problem is to add a very small constant
value to the diagonal elements, e.g. 1e-15.
Transform the first five eigenvectors using the np.reshape function back into a
matrix and display them as images. What do the resulting images represent (both
numerically and in the context of faces)?
Project the first image from the series to the PCA space and then back again. Is
the result the same? What do you notice when you change one dimension of the
vector in the image space (e.g. component with index 4074) to 0 and display the
image? Repeat a similar procedure for a vector in the PCA space (project image
in the PCA space, change one of the first five components to zero and project the
image back in the image space and display it as an image). What is the difference?
How many pixels are changed by the first operation and how many by the second?
"""
imgs = uz_image.read_images('./data/faces/1')
U, _, _, mean = uz_image.dual_PCA(imgs)
# Plot those eigenvectors
fig, axs = plt.subplots(1, 5)
fig.suptitle('5 strongest eigenvectors')
for i in range(5):
axs[i].imshow(U[:, i].reshape((96, 84)), cmap='gray')
plt.show()
@ -107,10 +203,13 @@ def three_b():
# Project images back into original subspace
x_i = np.matmul(y_i, U.T) + mean
# Plot the original images and the projected images
fig, axs = plt.subplots(1, 5)
fig, axs = plt.subplots(2, 5)
fig.suptitle('Original images (top) vs reprojected images (bottom)')
for i in range(5):
axs[i].imshow(imgs[i].reshape((96, 84)), cmap='gray')
axs[0,i].imshow(imgs[i].reshape((96, 84)), cmap='gray')
for i in range(5):
axs[1,i].imshow(x_i[i].reshape((96, 84)), cmap='gray')
plt.show()
img = imgs[0].copy()
@ -124,15 +223,15 @@ def three_b():
# Project image back into original subspace
x_i = np.matmul(y_i, U.T) + mean
# Count the difference in pixels
diff = (np.sum(np.abs(img - x_i)))
# Plot the original image and the projected image
fig, axs = plt.subplots(1, 2)
fig.suptitle(f'Add noise in cell 4074 before projection into PCA subspace, diff: {np.round(diff, 2)}')
axs[0].imshow(img.reshape((96, 84)), cmap='gray')
axs[1].imshow(x_i.reshape((96, 84)), cmap='gray')
plt.show()
# Count the difference in pixels
print(np.sum(np.abs(img - x_i)))
img2 = imgs[0].copy()
# Project image into PCA subspace
@ -143,36 +242,33 @@ def three_b():
x_i = np.matmul(y_i, U.T) + mean
# Plot the original image and the projected images
diff = (np.sum(np.abs(img - x_i)))
fig, axs = plt.subplots(1, 2)
fig.suptitle(f'Add noise in PCA space, diff: {np.round(diff, 2)}')
axs[0].imshow(img2.reshape((96, 84)), cmap='gray')
axs[1].imshow(x_i.reshape((96, 84)), cmap='gray')
plt.show()
# Count the number of pixels that are different
print(np.sum(np.abs(img - x_i)))
def read_images(data_path):
imgs = np.array([])
for filename in os.listdir(data_path):
img = uz_image.imread_gray(os.path.join(data_path, filename), uz_image.ImageType.float64)
# Reshape image into a vector
img = img.reshape((img.shape[0] * img.shape[1], 1))
if imgs.size == 0:
imgs = img
else:
imgs = np.hstack((imgs, img))
return imgs.T
def three_c():
imgs = read_images('./data/faces/1')
U, S, VT, mean = dual_PCA(imgs)
"""
Effect of the number of components on the reconstruction: Take a random
image and project it into the PCA space. Then change the vector in the PCA space
by retaining only the first 32 components and setting the remaining components to
0. Project the resulting vector back to the image space and display it as an image.
Repeat the procedure for the first 16, 8, 4, 2, and one eigenvector. Display the
resulting vectors together on one figure. What do you notice?
"""
imgs = uz_image.read_images('./data/faces/1')
U, _, _, mean = uz_image.dual_PCA(imgs)
img = imgs[0].copy()
values = [32, 16, 8, 4, 2, 1]
for value in values:
fig, axs = plt.subplots( 2, len(values))
fig.suptitle('Reprojection for different number of eigenvectors')
for ix, value in enumerate(values):
# Project image into PCA subspace
y_i = np.matmul(img - mean, U)
y_i[value:] = 0
@ -180,18 +276,30 @@ def three_c():
# Project image back into original subspace
x_i = np.matmul(y_i, U.T) + mean
# Plot the original image and the projected images
fig, axs = plt.subplots(1, 2)
axs[0].imshow(img.reshape((96, 84)), cmap='gray')
axs[1].imshow(x_i.reshape((96, 84)), cmap='gray')
plt.show()
# Count the number of pixels that are different
print(np.sum(np.abs(img - x_i)))
diff = (np.sum(np.abs(img - x_i)))
# Plot the original image and the projected images
axs[0, ix].imshow(img.reshape((96, 84)), cmap='gray')
axs[0, ix].set(title='Original image')
axs[1, ix].imshow(x_i.reshape((96, 84)), cmap='gray')
axs[1, ix].set(title=f'#eigenvectors: {value}, diff: {np.round(diff, 2)}')
plt.show()
def three_e():
imgs = read_images('./data/faces/1')
U, S, VT, mean = dual_PCA(imgs)
"""
Reconstruction of a foreign image: The PCA space is built upon
an array of data. In this task we will check the effect that the transformation to PCA
space has on an image that is not similar to the images that were used to build the
PCA space. Write a script that computes the PCA space for the first face dataset.
Then load the image elephant.jpg, reshape it into a vector and transform it into
the precalculated PCA space, then transform it back to image space. Reshape the
resulting vector back to an image and display both the original and the reconstructed
image. Comment on the results.
"""
imgs = uz_image.read_images('./data/faces/1')
U, _, _, mean = uz_image.dual_PCA(imgs)
elephant = uz_image.imread_gray('./data/elephant.jpg', uz_image.ImageType.float64)
elephant = elephant.reshape((elephant.shape[0] * elephant.shape[1], 1)).T
@ -202,30 +310,60 @@ def three_e():
# Project image back into original subspace
x_i = np.matmul(y_i, U.T) + mean
# Count the number of pixels that are different
diff = (np.sum(np.abs(elephant - x_i)))
# Plot the original image and the projected images
fig, axs = plt.subplots(1, 2)
fig.suptitle(f'Elephant reprojected into FACES subspace, diff: {np.round(diff, 2)}')
axs[0].imshow(elephant.reshape((96, 84)), cmap='gray')
axs[1].imshow(x_i.reshape((96, 84)), cmap='gray')
plt.show()
def three_g():
imgs_1 = uz_image.read_images('./data/faces/1')
imgs_2 = uz_image.read_images('./data/faces/2')
imgs_3 = uz_image.read_images('./data/faces/3')
imgs = np.vstack((imgs_1, imgs_2, imgs_3))
U, S, VT, mean = uz_image.dual_PCA(imgs)
# Reduce dimension of U to 2D
U_2d = U[:, :2].copy()
# Project images into PCA subspace
y_i = np.matmul(imgs - mean, U_2d)
y_i = -y_i
# Plot the projected images
fig, axs = plt.subplots(1, 2)
axs[0].scatter(y_i[:imgs_1.shape[0], 0], y_i[:imgs_1.shape[0], 1], c='r')
axs[0].scatter(y_i[imgs_1.shape[0]:imgs_1.shape[0] + imgs_2.shape[0], 0], y_i[imgs_1.shape[0]:imgs_1.shape[0] + imgs_2.shape[0], 1], c='g')
axs[0].scatter(y_i[imgs_1.shape[0] + imgs_2.shape[0]:, 0], y_i[imgs_1.shape[0] + imgs_2.shape[0]:, 1], c='b')
# ######## #
# SOLUTION #
# ######## #
pts = LDA(U, 3, 64)
# Plot the projected images
axs[1].scatter(pts[:imgs_1.shape[0], 0], pts[:imgs_1.shape[0], 1], c='r')
axs[1].scatter(pts[imgs_1.shape[0]:imgs_1.shape[0] + imgs_2.shape[0], 0], pts[imgs_1.shape[0]:imgs_1.shape[0] + imgs_2.shape[0], 1], c='g')
axs[1].scatter(pts[imgs_1.shape[0] + imgs_2.shape[0]:, 0], pts[imgs_1.shape[0] + imgs_2.shape[0]:, 1], c='b')
plt.show()
def main():
#one_b()
#one_d()
#one_e()
#one_f()
#two_b()
#three_a()
#three_b()
#three_c()
three_e()
#two_c()
#ex2()
#ex3()
#three_g()
if __name__ == '__main__':
main()

View File

@ -8,6 +8,7 @@ import enum
from tqdm import tqdm
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import os
class ImageType(enum.Enum):
uint8 = 0
@ -1724,13 +1725,14 @@ def drawEllipse(mu, cov, n_std=1):
plt.gca().add_patch(ellipse)
def compute_PCA(points: npt.NDArray[np.float64]):
def compute_PCA(points: npt.NDArray[np.float64], plot: bool = False):
# 1. b)
# Build matrix X
X = points.T.copy()
mean = np.mean(X, axis=1)
plt.scatter(X[0, :], X[1, :])
if (plot):
plt.scatter(X[0, :], X[1, :])
# Center the data
X = X - mean[:, np.newaxis]
@ -1738,21 +1740,22 @@ def compute_PCA(points: npt.NDArray[np.float64]):
# Compute the covariance matrix
cov_matrix = (1/ (X.shape[1] - 1)) * (X @ X.T) # TODO: fix this
drawEllipse(mean, cov_matrix, n_std=1)
if (plot):
drawEllipse(mean, cov_matrix, n_std=1)
U, S, VT = np.linalg.svd(cov_matrix)
# 1. c)
if (plot):
# Plot the eigenvectors
plt.plot([mean[0], mean[0] + U[0,0] *np.sqrt(S[0])], [mean[1], mean[1] + U[0,1] *np.sqrt(S[0])], color='red')
plt.plot([mean[0], mean[0] + U[1,0] *np.sqrt(S[1])], [mean[1], mean[1] + U[1,1] *np.sqrt(S[1])], color='green')
# Plot the eigenvectors
plt.plot([mean[0], mean[0] + U[0,0] *np.sqrt(S[0])], [mean[1], mean[1] + U[0,1] *np.sqrt(S[0])], color='red')
plt.plot([mean[0], mean[0] + U[1,0] *np.sqrt(S[1])], [mean[1], mean[1] + U[1,1] *np.sqrt(S[1])], color='green')
plt.show()
plt.show()
return U, S, VT, mean
def plot_histogram_pca(U, S, VT):
def plot_histogram_pca(U: npt.NDArray[np.float64], S: npt.NDArray[np.float64], VT: npt.NDArray[np.float64]):
eigvals = S.copy()
# Normalize eigenvalues
eigvals = eigvals / np.sum(eigvals)
@ -1760,3 +1763,55 @@ def plot_histogram_pca(U, S, VT):
# Plot the histogram
plt.bar(range(1, len(eigvals) + 1), eigvals, alpha=0.5, align='center')
plt.show()
def dual_PCA(points: npt.NDArray[np.float64]):
X = points.T.copy()
mean = np.mean(X, axis=1)
X = X - mean[:, np.newaxis]
# Compute the covariance matrix
C = (1/(X.shape[1]-1)) * X.T @ X
# Compute the eigenvalues and eigenvectors
U, S, VT = np.linalg.svd(C)
S += 1e-15 # avoid division by zero for eigenvalues that are numerically zero
# Map the dual-space eigenvectors back to the original space and normalise them
U = X @ U @ np.sqrt(np.diag(1/(S * (X.shape[1]-1))))
return U, S, VT, mean
def read_images(data_path: str):
imgs = np.array([])
for filename in os.listdir(data_path):
img = imread_gray(os.path.join(data_path, filename), ImageType.float64)
# Reshape image into a vector
img = img.reshape((img.shape[0] * img.shape[1], 1))
if imgs.size == 0:
imgs = img
else:
imgs = np.hstack((imgs, img))
return imgs.T # one image per row, shape (number of images, m*n)
def LDA(points, c, n):
# Linear discriminant analysis for c classes with n samples per class;
# consecutive blocks of n rows in points are assumed to belong to the same class.
MM = np.mean(points, axis=0) # overall mean (per feature)
Ms = []
Sb = np.zeros((points.shape[1], points.shape[1])) # between-class scatter
Sw = np.zeros((points.shape[1], points.shape[1])) # within-class scatter
for i in range(c):
Ms.append(np.mean(points[i * n:(i + 1) * n], axis=0)) # class mean
Sb += n * np.outer(Ms[i] - MM, Ms[i] - MM)
Sw += np.cov(points[i * n:(i + 1) * n].T)
# Solve the eigenproblem of Sw^-1 @ Sb
Sw_inv = np.linalg.inv(Sw)
SWB = np.matmul(Sw_inv, Sb)
eig_vals, eig_vecs = np.linalg.eig(SWB)
print(eig_vecs)
Ms = np.array(Ms)
# Express the class means in the eigenvector basis and project the data onto these transformed means
Ms = eig_vecs.T.dot(Ms.T).T
return np.matmul(points, Ms.T)