def _load_image_stack(filenames, image_size, rewrite_image=False):
    """Read every file in *filenames* into one ``(n, size, size, 3)`` float array.

    Each image is read with ``skimage.io.imread``, divided by 255 and resized
    to ``image_size`` x ``image_size``.

    NOTE(review): the target array has three channels, so grayscale or RGBA
    files are presumably not expected here -- confirm against the dataset.

    Args:
        filenames: iterable of image paths.
        image_size: edge length, in pixels, of the square output images.
        rewrite_image: when true, the resized image is written back over the
            source file as ``uint8``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(filenames), image_size, image_size, 3)``.
    """
    # Imported locally: the module header only imports ``skimage`` and
    # ``skimage.io``, which does not guarantee ``skimage.transform`` is loaded.
    from skimage.transform import resize

    filenames = list(filenames)
    stack = numpy.zeros((len(filenames), image_size, image_size, 3))
    for index, image_filename in enumerate(filenames):
        # Announce the file first so a failing read is easy to attribute.
        print('Loading %s' % image_filename)
        pixels = skimage.io.imread(image_filename).astype(float) / 255.0
        pixels = resize(pixels, (image_size, image_size), preserve_range=True)
        stack[index] = pixels
        if rewrite_image:
            skimage.io.imsave(image_filename, (pixels * 255.0).astype('uint8'))
    return stack


def _build_mosaic(images_by_key, keys, image_size):
    """Tile the four per-room image stacks into one 2x2 mosaic per sample.

    The stacks named by *keys* fill the quadrants in the order top-left,
    top-right, bottom-left, bottom-right.

    Args:
        images_by_key: mapping from image key to an ``(n, size, size, 3)`` array.
        keys: the four keys to tile, in quadrant order.
        image_size: edge length of the mosaic (and of the input images).

    Returns:
        ``numpy.ndarray`` of shape ``(n, image_size, image_size, 3)``.
    """
    from skimage.transform import resize

    half = image_size // 2
    sample_count = len(images_by_key[keys[0]])
    mosaic = numpy.zeros((sample_count, image_size, image_size, 3))

    quadrants = [
        (slice(0, half), slice(0, half)),
        (slice(0, half), slice(half, image_size)),
        (slice(half, image_size), slice(0, half)),
        (slice(half, image_size), slice(half, image_size)),
    ]
    for key, (rows, cols) in zip(keys, quadrants):
        # Resize to the quadrant's real extent so an odd ``image_size`` still fits.
        tile_shape = (rows.stop - rows.start, cols.stop - cols.start)
        for index, image in enumerate(images_by_key[key]):
            mosaic[index, rows, cols, :] = resize(image, tile_shape, preserve_range=True)
    return mosaic


def get_dataset_image(df, train_data, test_data, image_size=32, rewrite_image=False):
    """Load the four room images of every house plus a 2x2 mosaic of them.

    Args:
        df: full data frame; unused here, kept for interface compatibility.
        train_data: training rows; must provide the columns ``bathroom_img``,
            ``bedroom_img``, ``kitchen_img`` and ``frontal_img`` holding paths.
        test_data: test rows with the same four columns.
        image_size: edge length, in pixels, of every returned image.
        rewrite_image: when true, the resized *training* images overwrite
            their source files (test files are never rewritten).

    Returns:
        ``(train_x_dict, test_x_dict)``. Each dict maps the four column names
        and ``"mosaic_img"`` to an ``(n, image_size, image_size, 3)`` array.
    """
    image_vals = ["bathroom_img", "bedroom_img", "kitchen_img", "frontal_img"]

    train_x_dict = {
        key: _load_image_stack(train_data[key], image_size, rewrite_image)
        for key in image_vals
    }
    test_x_dict = {
        key: _load_image_stack(test_data[key], image_size)
        for key in image_vals
    }

    # Each split's mosaic must be built from that split's own four rooms;
    # tiling training images into the test mosaic would leak training data.
    train_x_dict["mosaic_img"] = _build_mosaic(train_x_dict, image_vals, image_size)
    test_x_dict["mosaic_img"] = _build_mosaic(test_x_dict, image_vals, image_size)

    return train_x_dict, test_x_dict
def normalize_dataset_data(df, train_data, test_data):
    """Convert the tabular house attributes and prices into scaled arrays.

    The numeric columns and the price are min-max scaled with scalers fitted
    on the training rows only. The zip code is one-hot encoded with a
    binarizer fitted on ``df``, so both splits share the same set of columns.

    Args:
        df: full data frame; only its ``zipcode`` column is read.
        train_data: training rows.
        test_data: test rows.

    Returns:
        ``((trainX, trainY, testX, testY), [feature_scaler, zip_encoder,
        price_scaler])``. The X arrays hold the zip-code columns followed by
        the scaled numeric columns.
    """
    def as_column(series):
        # The scaler is fed a 2-D (n_samples, 1) array.
        return numpy.array(series).reshape((-1, 1))

    numeric_columns = ["bedrooms", "bathrooms", "area"]

    # Numeric attributes: fit on train, reuse the same scaling for test.
    feature_scaler = MinMaxScaler()
    train_numeric = feature_scaler.fit_transform(train_data[numeric_columns])
    test_numeric = feature_scaler.transform(test_data[numeric_columns])

    # Zip code: one-hot encoding whose classes come from the whole frame.
    zip_encoder = LabelBinarizer()
    zip_encoder.fit(df["zipcode"])
    train_zip = zip_encoder.transform(train_data["zipcode"])
    test_zip = zip_encoder.transform(test_data["zipcode"])

    # Target price, scaled with its own scaler so it can be inverted later.
    price_scaler = MinMaxScaler()
    train_prices = price_scaler.fit_transform(as_column(train_data["price"]))
    test_prices = price_scaler.transform(as_column(test_data["price"]))

    # Categorical columns first, numeric columns after.
    train_features = numpy.concatenate([train_zip, train_numeric], axis=1)
    test_features = numpy.concatenate([test_zip, test_numeric], axis=1)

    splits = (train_features, train_prices, test_features, test_prices)
    return splits, [feature_scaler, zip_encoder, price_scaler]
__future__ import absolute_import, division, print_function, unicode_literals + +__author__ = 106360 +# !pip install tensorflow==2.0.0rc0 + + + +import tensorflow as tf + +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D +from tensorflow.keras.preprocessing.image import ImageDataGenerator + +import os +import numpy as np +import matplotlib.pyplot as plt + +"""## Load data + +Begin by downloading the dataset. This tutorial uses a filtered version of <a href="https://www.kaggle.com/c/dogs-vs-cats/data" target="_blank">Dogs vs Cats</a> dataset from Kaggle. Download the archive version of the dataset and store it in the "/tmp/" directory. +""" + +_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip' + +path_to_zip = tf.keras.utils.get_file('cats_and_dogs.zip', origin=_URL, extract=True) + +PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered') + +"""The dataset has the following directory structure: + +<pre> +<b>cats_and_dogs_filtered</b> +|__ <b>train</b> + |______ <b>cats</b>: [cat.0.jpg, cat.1.jpg, cat.2.jpg ....] + |______ <b>dogs</b>: [dog.0.jpg, dog.1.jpg, dog.2.jpg ...] +|__ <b>validation</b> + |______ <b>cats</b>: [cat.2000.jpg, cat.2001.jpg, cat.2002.jpg ....] + |______ <b>dogs</b>: [dog.2000.jpg, dog.2001.jpg, dog.2002.jpg ...] +</pre> + +After extracting its contents, assign variables with the proper file path for the training and validation set. 
+""" + +train_dir = os.path.join(PATH, 'train') +validation_dir = os.path.join(PATH, 'validation') + +train_cats_dir = os.path.join(train_dir, 'cats') # directory with our training cat pictures +train_dogs_dir = os.path.join(train_dir, 'dogs') # directory with our training dog pictures +validation_cats_dir = os.path.join(validation_dir, 'cats') # directory with our validation cat pictures +validation_dogs_dir = os.path.join(validation_dir, 'dogs') # directory with our validation dog pictures + +"""### Understand the data + +Let's look at how many cats and dogs images are in the training and validation directory: +""" + +num_cats_tr = len(os.listdir(train_cats_dir)) +num_dogs_tr = len(os.listdir(train_dogs_dir)) + +num_cats_val = len(os.listdir(validation_cats_dir)) +num_dogs_val = len(os.listdir(validation_dogs_dir)) + +total_train = num_cats_tr + num_dogs_tr +total_val = num_cats_val + num_dogs_val + +print('total training cat images:', num_cats_tr) +print('total training dog images:', num_dogs_tr) + +print('total validation cat images:', num_cats_val) +print('total validation dog images:', num_dogs_val) +print("--") +print("Total training images:", total_train) +print("Total validation images:", total_val) + +"""For convenience, set up variables to use while pre-processing the dataset and training the network.""" + +batch_size = 128 +epochs = 15 +IMG_HEIGHT = 150 +IMG_WIDTH = 150 + +"""## Data preparation + +Format the images into appropriately pre-processed floating point tensors before feeding to the network: + +1. Read images from the disk. +2. Decode contents of these images and convert it into proper grid format as per their RGB content. +3. Convert them into floating point tensors. +4. Rescale the tensors from values between 0 and 255 to values between 0 and 1, as neural networks prefer to deal with small input values. + +Fortunately, all these tasks can be done with the `ImageDataGenerator` class provided by `tf.keras`. 
def plotImages(images_arr):
    """Show up to five images side by side in a single-row figure.

    Args:
        images_arr: iterable of images accepted by ``Axes.imshow``; images
            beyond the fifth are not drawn.
    """
    _, axis_grid = plt.subplots(1, 5, figsize=(20, 20))
    panels = axis_grid.flatten()
    # zip stops at the shorter input, so at most five panels get an image.
    for picture, panel in zip(images_arr, panels):
        panel.imshow(picture)
        panel.axis('off')
    plt.tight_layout()
    plt.show()
There's a fully connected layer with 512 units on top of it that is activated by a `relu` activation function. The model outputs class probabilities based on binary classification by the `sigmoid` activation function. +""" + +model = Sequential([ + Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)), + MaxPooling2D(), + Conv2D(32, 3, padding='same', activation='relu'), + MaxPooling2D(), + Conv2D(64, 3, padding='same', activation='relu'), + MaxPooling2D(), + Flatten(), + Dense(512, activation='relu'), + Dense(1, activation='sigmoid') +]) + +"""### Compile the model + +For this tutorial, choose the *ADAM* optimizer and *binary cross entropy* loss function. To view training and validation accuracy for each training epoch, pass the `metrics` argument. +""" + +model.compile(optimizer='adam', + loss='binary_crossentropy', + metrics=['accuracy']) + +"""### Model summary + +View all the layers of the network using the model's `summary` method: +""" + +model.summary() + +"""### Train the model + +Use the model's `fit_generator` method to train the network on the batches produced by the `ImageDataGenerator`. +""" + +history = model.fit_generator( + train_data_gen, + steps_per_epoch=total_train // batch_size, + epochs=epochs, + validation_data=val_data_gen, + validation_steps=total_val // batch_size +) + +"""### Visualize training results + +Now visualize the results after training the network. 
+""" + +acc = history.history['accuracy'] +val_acc = history.history['val_accuracy'] + +loss = history.history['loss'] +val_loss = history.history['val_loss'] + +epochs_range = range(epochs) + +plt.figure(figsize=(8, 8)) +plt.subplot(1, 2, 1) +plt.plot(epochs_range, acc, label='Training Accuracy') +plt.plot(epochs_range, val_acc, label='Validation Accuracy') +plt.legend(loc='lower right') +plt.title('Training and Validation Accuracy') + +plt.subplot(1, 2, 2) +plt.plot(epochs_range, loss, label='Training Loss') +plt.plot(epochs_range, val_loss, label='Validation Loss') +plt.legend(loc='upper right') +plt.title('Training and Validation Loss') +plt.show() + +"""As you can see from the plots, training accuracy and validation accuracy are off by large margin and the model has achieved only around **70%** accuracy on the validation set. + +Let's look at what went wrong and try to increase overall performance of the model. + +## Overfitting + +In the plots above, the training accuracy is increasing linearly over time, whereas validation accuracy stalls around 70% in the training process. Also, the difference in accuracy between training and validation accuracy is noticeable—a sign of *overfitting*. + +When there are a small number of training examples, the model sometimes learns from noises or unwanted details from training examples—to an extent that it negatively impacts the performance of the model on new examples. This phenomenon is known as overfitting. It means that the model will have a difficult time generalizing on a new dataset. + +There are multiple ways to fight overfitting in the training process. In this tutorial, you'll use *data augmentation* and add *dropout* to our model. + +## Data augmentation + +Overfitting generally occurs when there are a small number of training examples. One way to fix this problem is to augment the dataset so that it has a sufficient number of training examples. 
Data augmentation takes the approach of generating more training data from existing training samples by augmenting the samples using random transformations that yield believable-looking images. The goal is the model will never see the exact same picture twice during training. This helps expose the model to more aspects of the data and generalize better. + +Implement this in `tf.keras` using the `ImageDataGenerator` class. Pass different transformations to the dataset and it will take care of applying it during the training process. + +### Augment and visualize data + +Begin by applying random horizontal flip augmentation to the dataset and see how individual images look like after the transformation. + +### Apply horizontal flip + +Pass `horizontal_flip` as an argument to the `ImageDataGenerator` class and set it to `True` to apply this augmentation. +""" + +image_gen = ImageDataGenerator(rescale=1./255, horizontal_flip=True) + +train_data_gen = image_gen.flow_from_directory(batch_size=batch_size, + directory=train_dir, + shuffle=True, + target_size=(IMG_HEIGHT, IMG_WIDTH)) + +"""Take one sample image from the training examples and repeat it five times so that the augmentation is applied to the same image five times.""" + +augmented_images = [train_data_gen[0][0][0] for i in range(5)] + +# Re-use the same custom plotting function defined and used +# above to visualize the training images +plotImages(augmented_images) + +"""### Randomly rotate the image + +Let's take a look at a different augmentation called rotation and apply 45 degrees of rotation randomly to the training examples. 
+""" + +image_gen = ImageDataGenerator(rescale=1./255, rotation_range=45) + +train_data_gen = image_gen.flow_from_directory(batch_size=batch_size, + directory=train_dir, + shuffle=True, + target_size=(IMG_HEIGHT, IMG_WIDTH)) + +augmented_images = [train_data_gen[0][0][0] for i in range(5)] + +plotImages(augmented_images) + +"""### Apply zoom augmentation + +Apply a zoom augmentation to the dataset to zoom images up to 50% randomly. +""" + +image_gen = ImageDataGenerator(rescale=1./255, zoom_range=0.5) + +train_data_gen = image_gen.flow_from_directory(batch_size=batch_size, + directory=train_dir, + shuffle=True, + target_size=(IMG_HEIGHT, IMG_WIDTH)) + +augmented_images = [train_data_gen[0][0][0] for i in range(5)] + +plotImages(augmented_images) + +"""### Put it all together + +Apply all the previous augmentations. Here, you applied rescale, 45 degree rotation, width shift, height shift, horizontal flip and zoom augmentation to the training images. +""" + +image_gen_train = ImageDataGenerator( + rescale=1./255, + rotation_range=45, + width_shift_range=.15, + height_shift_range=.15, + horizontal_flip=True, + zoom_range=0.5 + ) + +train_data_gen = image_gen_train.flow_from_directory(batch_size=batch_size, + directory=train_dir, + shuffle=True, + target_size=(IMG_HEIGHT, IMG_WIDTH), + class_mode='binary') + +"""Visualize how a single image would look five different times when passing these augmentations randomly to the dataset.""" + +augmented_images = [train_data_gen[0][0][0] for i in range(5)] +plotImages(augmented_images) + +"""### Create validation data generator + +Generally, only apply data augmentation to the training examples. In this case, only rescale the validation images and convert them into batches using `ImageDataGenerator`. 
+""" + +image_gen_val = ImageDataGenerator(rescale=1./255) + +val_data_gen = image_gen_val.flow_from_directory(batch_size=batch_size, + directory=validation_dir, + target_size=(IMG_HEIGHT, IMG_WIDTH), + class_mode='binary') + +"""## Dropout + +Another technique to reduce overfitting is to introduce *dropout* to the network. It is a form of *regularization* that forces the weights in the network to take only small values, which makes the distribution of weight values more regular and the network can reduce overfitting on small training examples. Dropout is one of the regularization techniques used in this tutorial. + +When you apply dropout to a layer, it randomly drops out (sets to zero) a number of output units from the applied layer during the training process. Dropout takes a fractional number as its input value, in the form such as 0.1, 0.2, 0.4, etc. This means dropping out 10%, 20% or 40% of the output units randomly from the applied layer. + +When applying 0.1 dropout to a certain layer, it randomly kills 10% of the output units in each training epoch. + +Create a network architecture with this new dropout feature and apply it to different convolutions and fully-connected layers. + +## Creating a new network with Dropouts + +Here, you apply dropout to the first and last max pool layers. 20% of the output units of the first and last max pool layers are randomly set to zero during each training epoch. 
+""" + +model_new = Sequential([ + Conv2D(16, 3, padding='same', activation='relu', + input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)), + MaxPooling2D(), + Dropout(0.2), + Conv2D(32, 3, padding='same', activation='relu'), + MaxPooling2D(), + Conv2D(64, 3, padding='same', activation='relu'), + MaxPooling2D(), + Dropout(0.2), + Flatten(), + Dense(512, activation='relu'), + Dense(1, activation='sigmoid') +]) + +"""### Compile the model + +After introducing dropouts to the network, compile the model and view the layers summary. +""" + +model_new.compile(optimizer='adam', + loss='binary_crossentropy', + metrics=['accuracy']) + +model_new.summary() + +"""### Train the model + +After successfully introducing data augmentations to the training examples and adding dropouts to the network, train this new network: +""" + +history = model_new.fit_generator( + train_data_gen, + steps_per_epoch=total_train // batch_size, + epochs=epochs, + validation_data=val_data_gen, + validation_steps=total_val // batch_size +) + +"""### Visualize the model + +Visualize the new model after training, you can see that there is significantly less overfitting than before. The accuracy should go up after training the model for more epochs. 
def generate_simple_regression_model(input_shape):
    """Build the small fully connected network used for price regression.

    The network is 16 -> 6 -> 1 units: ReLU on the two hidden layers and a
    sigmoid on the single output unit.

    Args:
        input_shape: number of input features (passed as ``input_dim``).

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # The source carries a disabled Dropout(0.25) after each hidden layer;
    # it stays disabled here.
    hidden_and_output = [
        Dense(16, input_dim=input_shape, activation="relu"),
        Dense(6, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    return Sequential(hidden_and_output)
def train_cifar100(num_classes=100, batch_size=32):
    """Train the simple CNN as a CIFAR-100 classifier and return the model.

    Training images are scaled to [0, 1] and augmented on the fly with small
    horizontal/vertical shifts and horizontal flips. Validation uses the
    un-augmented test split.

    Args:
        num_classes: width of the one-hot labels and of the softmax layer.
            CIFAR-100 labels go up to 99, so this presumably has to stay at
            100 or more -- confirm before lowering it.
        batch_size: mini-batch size of the augmentation generator.

    Returns:
        The model after 20 epochs of training.
    """
    # CIFAR-100, not CIFAR-10: the function name, the default ``num_classes``
    # and the weight file written by the caller all refer to CIFAR-100.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()
    y_train = tf.keras.utils.to_categorical(y_train, num_classes)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255

    print('Using real-time data augmentation.')
    # Only the non-default augmentation options are spelled out. No
    # feature-wise statistic or ZCA whitening is enabled, so the generator
    # does not need ``datagen.fit(x_train)``.
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        width_shift_range=0.1,   # random horizontal shift (fraction of width)
        height_shift_range=0.1,  # random vertical shift (fraction of height)
        horizontal_flip=True,    # random left/right flip
    )

    dataset_train = datagen.flow(x_train, y_train, batch_size=batch_size)
    num_steps_train = x_train.shape[0] // batch_size

    # Take the input edge length from the data and honour ``num_classes``
    # instead of hard-coding 32 and 100.
    model = generate_simple_cnn_regression_model(
        x_train.shape[1], is_regression=False, num_classes=num_classes)
    opt = Adam(lr=1e-3, decay=1e-3 / 200)
    model.compile(loss='categorical_crossentropy',
                  metrics=['categorical_crossentropy', 'accuracy'],
                  optimizer=opt)
    model.summary()

    model.fit_generator(dataset_train, steps_per_epoch=num_steps_train,
                        epochs=20,
                        validation_data=(x_test, y_test),
                        workers=1)
    return model
+ file_weight_finetune = 'regression_model_image_finetune.h5' + model = generate_simple_cnn_regression_model(input_shape,weights=weights,freeze=True) + opt = Adam(lr=1e-3, decay=1e-3 / 200) + model.compile(loss='mean_squared_error',metrics=['mean_absolute_percentage_error','mean_absolute_error','mean_squared_error'], optimizer=opt) + model.summary() + model = train_model(trainX, trainY, testX, testY,model,show_plot=True,epochs=500,batch_size=32) + evaluate_regression_model(model,testX,testY,normalizer,show_plot=True) + model.save(file_weight_finetune) + weights = file_weight_finetune + final_model_weight = 'regression_model_image_pretrained.h5' + else: + final_model_weight = 'regression_model_image_from_scratch.h5' + + + model = generate_simple_cnn_regression_model(input_shape, weights=weights) + opt = Adam(lr=1e-3, decay=1e-3 / 200) + model.compile(loss='mean_squared_error', + metrics=['mean_absolute_percentage_error', 'mean_absolute_error', 'mean_squared_error'], + optimizer=opt) + model.summary() + model = train_model(trainX, trainY, testX, testY, model, show_plot=True, epochs=500, batch_size=32) + evaluate_regression_model(model, testX, testY, normalizer, show_plot=True) + model.save(final_model_weight) diff --git a/house_prices_estimation_example_image_and_data.py b/house_prices_estimation_example_image_and_data.py new file mode 100644 index 0000000000000000000000000000000000000000..88cd8f7f69bac2be33fe9304eb63fcc25c93b5f6 --- /dev/null +++ b/house_prices_estimation_example_image_and_data.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +import os + +from tensorflow_core.python.keras.datasets import cifar10 +from tensorflow_core.python.keras.engine.input_layer import Input +from tensorflow_core.python.keras.layers.convolutional import Conv2D +from tensorflow_core.python.keras.layers.core import Dense, Dropout, Activation, Flatten +from tensorflow_core.python.keras.layers.normalization import BatchNormalization +from tensorflow_core.python.keras.layers.pooling 
import MaxPooling2D
+from tensorflow_core.python.keras.models import Sequential, Model, load_model
+from tensorflow_core.python.keras.optimizer_v2.adam import Adam
+import tensorflow as tf
+
+from utils import plot_regression, load_house_dataset_data, DatasetType
+import matplotlib.pyplot as plt
+
+__author__ = 106360
+
+
+def train_model(trainX, trainY, testX, testY, model, epochs=200, batch_size=16,show_plot=True):
+    """Fit ``model``, optionally plot its MAPE and loss curves, and return the fitted model."""
+    history = model.fit(trainX, trainY, validation_data=(testX, testY),
+                        epochs=epochs, batch_size=batch_size)
+
+    if show_plot:
+        acc = history.history['mean_absolute_percentage_error']
+        val_acc = history.history['val_mean_absolute_percentage_error']
+        loss = history.history['loss']
+        val_loss = history.history['val_loss']
+
+        epochs_range = range(len(loss))  # one x value per recorded epoch, so axis and curves always match
+
+        plt.figure(figsize=(8, 8))
+        plt.subplot(1, 2, 1)
+        plt.plot(epochs_range, acc, label='Training mean_absolute_percentage_error')
+        plt.plot(epochs_range, val_acc, label='Validation mean_absolute_percentage_error')
+        plt.legend(loc='upper right')
+        plt.title('Training and Validation % error')
+
+        plt.subplot(1, 2, 2)
+        plt.plot(epochs_range, loss, label='Training Loss')
+        plt.plot(epochs_range, val_loss, label='Validation Loss')
+        plt.legend(loc='upper right')
+        plt.title('Training and Validation Loss')
+        plt.show()
+
+    return model
+
+
+# NOTE(review): generate_simple_cnn_regression_model, evaluate_regression_model and train_cifar100 are used below but never imported or defined in this file - TODO import them.
+if __name__ == "__main__":
+    pre_train_with_cifar100 = False
+    weights = ''
+    if pre_train_with_cifar100:
+        file_weight_cifar100 = 'pretrained_cifar100.h5'
+
+        try:
+            model = load_model(file_weight_cifar100)
+        except Exception:  # not a bare except: Ctrl-C / SystemExit must not fall through to train_cifar100()
+            model = train_cifar100()
+            model.save(file_weight_cifar100)
+        weights = file_weight_cifar100
+
+    # Load both the tabular and the image splits; only the image dicts are used below.
+    (trainX_data,trainX_img, trainY, testX_data,testX_img,testY), normalizer = load_house_dataset_data(test_size=0.2,random_state=666,type=DatasetType.Both)
+
+    # The 'frontal_img' entry is the only CNN input; shape[1] is passed on as the model's input size.
+    trainX = trainX_img['frontal_img']
+    testX = testX_img['frontal_img']
+    input_shape = trainX.shape[1]
+
+    if pre_train_with_cifar100:
+        file_weight_finetune =
'regression_model_image_finetune.h5'
+        model = generate_simple_cnn_regression_model(input_shape,weights=weights,freeze=True)
+        opt = Adam(lr=1e-3, decay=1e-3 / 200)
+        model.compile(loss='mean_squared_error',metrics=['mean_absolute_percentage_error','mean_absolute_error','mean_squared_error'], optimizer=opt)
+        model.summary()
+        model = train_model(trainX, trainY, testX, testY,model,show_plot=True,epochs=500,batch_size=32)
+        evaluate_regression_model(model,testX,testY,normalizer,show_plot=True)
+        model.save(file_weight_finetune)
+        weights = file_weight_finetune
+        final_model_weight = 'regression_model_image_pretrained.h5'
+    else:
+        final_model_weight = 'regression_model_image_from_scratch.h5'
+
+    # Final training run; `weights` is '' unless the fine-tuning branch above set it.
+    model = generate_simple_cnn_regression_model(input_shape, weights=weights)
+    opt = Adam(lr=1e-3, decay=1e-3 / 200)
+    model.compile(loss='mean_squared_error',
+                  metrics=['mean_absolute_percentage_error', 'mean_absolute_error', 'mean_squared_error'],
+                  optimizer=opt)
+    model.summary()
+    model = train_model(trainX, trainY, testX, testY, model, show_plot=True, epochs=500, batch_size=32)
+    evaluate_regression_model(model, testX, testY, normalizer, show_plot=True)
+    model.save(final_model_weight)
diff --git a/train_and_evaluation.py b/train_and_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..40cd4305358bbf72eb68f645f9c4061adf04ef3a
--- /dev/null
+++ b/train_and_evaluation.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+__author__ = 106360
+
+
+import numpy
+import pandas as pd
+import seaborn as sns
+from sklearn.metrics import mean_squared_error
+import scipy.stats
+import matplotlib.pyplot as plt
+
+
+def train_model(trainX, trainY, testX, testY, model, epochs=200, batch_size=16,show_plot=True):
+    """Fit ``model`` and optionally plot its learning curves.
+
+    ``model`` is fitted on (trainX, trainY) with (testX, testY) as validation
+    data. When ``show_plot`` is true the fit history must contain the
+    'mean_absolute_percentage_error' metric, which is plotted next to the loss.
+    Returns the fitted ``model``.
+    """
+    history = model.fit(trainX, trainY, validation_data=(testX, testY),
+                        epochs=epochs, batch_size=batch_size)
+
+    if show_plot:
+        acc = history.history['mean_absolute_percentage_error']
+        val_acc = history.history['val_mean_absolute_percentage_error']
+        loss = history.history['loss']
+        val_loss = history.history['val_loss']
+
+        # Use the recorded history length so the x axis always matches the curves.
+        epochs_range = range(len(loss))
+
+        plt.figure(figsize=(8, 8))
+        plt.subplot(1, 2, 1)
+        plt.plot(epochs_range, acc, label='Training mean_absolute_percentage_error')
+        plt.plot(epochs_range, val_acc, label='Validation mean_absolute_percentage_error')
+        plt.legend(loc='upper right')
+        plt.title('Training and Validation % error')
+
+        plt.subplot(1, 2, 2)
+        plt.plot(epochs_range, loss, label='Training Loss')
+        plt.plot(epochs_range, val_loss, label='Validation Loss')
+        plt.legend(loc='upper right')
+        plt.title('Training and Validation Loss')
+        plt.show()
+
+    return model
+
+
+def evaluate_regression_model(model,testX, testY,normalizer,show_plot=True):
+    """Predict on ``testX`` and plot the predictions against ``testY``.
+
+    ``normalizer[-1]`` must provide ``inverse_transform``; it is applied to both
+    the ground truth and the predictions before they are plotted.
+    """
+    predicted_Y = model.predict(testX)
+    test_Y_unnormalized = normalizer[-1].inverse_transform(testY)
+    predicted_Y_unnormalized = normalizer[-1].inverse_transform(predicted_Y)
+
+    if show_plot:
+        plot_regression(test_Y_unnormalized,predicted_Y_unnormalized)
+
+
+def plot_regression(gt_array, pred_array, filename='',show_plot=True):
+    """Draw a joint plot of predictions against ground truth with regression fits.
+
+    Returns ``(slope, intercept, r_value, p_value, std_err)`` from a linear
+    regression of ``pred_array`` on ``gt_array``. The figure is shown when
+    ``show_plot`` is true and saved to ``filename`` when it is non-empty.
+    """
+    # fig, ax = plt.subplots(1, 1)
+    x, y = pd.Series(numpy.squeeze(gt_array), name="ground_truth_val"), pd.Series(numpy.squeeze(pred_array), name="predicted_val")
+    g= sns.jointplot(x=x, y=y, marker='.')
+    # sns.regplot(x=x, y=y, ax=ax, marker=None, scatter=None)
+
+    # g = sns.jointplot(x, y, kind="hex", color="#5d5d60", joint_kws={'gridsize': 40, 'bins': 'log'}, xlim=(0, 1), ylim=(0, 1), stat_func=None)
+    # Identity line over [0, 1] as a visual reference; x/y are passed by keyword
+    # because recent seaborn releases no longer accept them positionally.
+    identity = pd.Series(numpy.arange(0, 1.001, 0.01))
+    sns.regplot(x=identity, y=identity, ax=g.ax_joint, scatter=True)
+    sns.regplot(x=x, y=y, ax=g.ax_joint, scatter=True)
+
+    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(numpy.squeeze(gt_array), numpy.squeeze(pred_array))
+    rms = numpy.sqrt(mean_squared_error(pred_array, gt_array))
+    if show_plot:
+        figure_title = 'rms=%.2f, r_value = %.2f, p_value = %.2f, y=%.2f*x + %.2f ' % (rms, r_value, p_value, slope, intercept)
+        g.fig.suptitle(figure_title)
+        # fig.suptitle(figure_title)
+        print(figure_title)
+        plt.tight_layout()
+        plt.show()
+    if filename!='':
+        g.fig.savefig(filename, dpi=600)
+    return slope, intercept, r_value, p_value, std_err
+
+
+
+if __name__ == "__main__":
+    pass
\ No newline at end of file