Skip to content

Notebook for data preprocessing#

# Importing libraries
import pandas as pd
import numpy as np

# Read csv file into a pandas dataframe
df = pd.read_csv("property_data.csv")

# Take a look at the first few rows
df
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3 1.0 1000
1 100002000.0 197.0 LEXINGTON N 3 1.5 --
2 100003000.0 NaN LEXINGON N NaN 1.0 850
3 100004000.0 201.0 BERKELEY 12 1 NaN 700
4 NaN 203.0 BERKELEY Y 3 2.0 1600
5 100006000.0 207.0 BERKELEY Y NaN 1.0 800
6 100007000.0 NaN WASHINGTON NaN 2 NaN 950
7 100008000.0 213.0 tremont Y 1 1.0 NaN
8 100009000.0 215.0 TREMONT Y na 2.0 1800
9 100009000.0 215.0 TREMONT Y na 2.0 1800
# Looking at the ST_NUM column
df['ST_NUM'].isnull()
0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
9    False
Name: ST_NUM, dtype: bool
df['ST_NUM'].isnull().sum()
2
# Looking at the NUM_BEDROOMS column
df['NUM_BEDROOMS'].isnull().sum()
2
# Making a list of missing value types
missing_values = ["n/a", "na", "--"]
df = pd.read_csv("property_data.csv", na_values = missing_values)
# Looking at the NUM_BEDROOMS column
df['NUM_BEDROOMS'].isnull().sum()
4
df
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 NaN
2 100003000.0 NaN LEXINGON N NaN 1.0 850.0
3 100004000.0 201.0 BERKELEY 12 1.0 NaN 700.0
4 NaN 203.0 BERKELEY Y 3.0 2.0 1600.0
5 100006000.0 207.0 BERKELEY Y NaN 1.0 800.0
6 100007000.0 NaN WASHINGTON NaN 2.0 NaN 950.0
7 100008000.0 213.0 tremont Y 1.0 1.0 NaN
8 100009000.0 215.0 TREMONT Y NaN 2.0 1800.0
9 100009000.0 215.0 TREMONT Y NaN 2.0 1800.0
df['OWN_OCCUPIED'].isnull()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
9    False
Name: OWN_OCCUPIED, dtype: bool
# Detecting numbers 
cnt=0
for row in df['OWN_OCCUPIED']:
    try:
        int(row)
        df.loc[cnt, 'OWN_OCCUPIED']=np.nan
    except ValueError:
        pass
    cnt+=1
df['OWN_OCCUPIED'].isnull()
0    False
1    False
2    False
3     True
4    False
5    False
6     True
7    False
8    False
9    False
Name: OWN_OCCUPIED, dtype: bool
df.isnull().sum()
PID             1
ST_NUM          2
ST_NAME         0
OWN_OCCUPIED    2
NUM_BEDROOMS    4
NUM_BATH        2
SQ_FT           2
dtype: int64
# Total number of missing value
df.isnull().sum().sum()
13
# Replace missing values with a number
df['ST_NUM'].fillna(125, inplace=True)
# Location based replacement
df.loc[2,'ST_NUM'] = 125
# Replace using median 
median = df['NUM_BEDROOMS'].median()
df['NUM_BEDROOMS'].fillna(median, inplace=True)
df.isnull().sum()
PID             1
ST_NUM          0
ST_NAME         0
OWN_OCCUPIED    2
NUM_BEDROOMS    0
NUM_BATH        2
SQ_FT           2
dtype: int64
# Replace using mode
df['NUM_BATH'].fillna(df['NUM_BATH'].mode()[0], inplace=True)
df
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 NaN
2 100003000.0 125.0 LEXINGON N 2.5 1.0 850.0
3 100004000.0 201.0 BERKELEY NaN 1.0 1.0 700.0
4 NaN 203.0 BERKELEY Y 3.0 2.0 1600.0
5 100006000.0 207.0 BERKELEY Y 2.5 1.0 800.0
6 100007000.0 125.0 WASHINGTON NaN 2.0 1.0 950.0
7 100008000.0 213.0 tremont Y 1.0 1.0 NaN
8 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
9 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
df.dtypes
PID             float64
ST_NUM          float64
ST_NAME          object
OWN_OCCUPIED     object
NUM_BEDROOMS    float64
NUM_BATH        float64
SQ_FT           float64
dtype: object
df['OWN_OCCUPIED'].fillna('N', inplace=True)
df
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 NaN
2 100003000.0 125.0 LEXINGON N 2.5 1.0 850.0
3 100004000.0 201.0 BERKELEY N 1.0 1.0 700.0
4 NaN 203.0 BERKELEY Y 3.0 2.0 1600.0
5 100006000.0 207.0 BERKELEY Y 2.5 1.0 800.0
6 100007000.0 125.0 WASHINGTON N 2.0 1.0 950.0
7 100008000.0 213.0 tremont Y 1.0 1.0 NaN
8 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
9 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
# Replace using mean
mean = df['SQ_FT'].mean()
df['SQ_FT'].fillna(mean, inplace=True)
df
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 1187.5
2 100003000.0 125.0 LEXINGON N 2.5 1.0 850.0
3 100004000.0 201.0 BERKELEY N 1.0 1.0 700.0
4 NaN 203.0 BERKELEY Y 3.0 2.0 1600.0
5 100006000.0 207.0 BERKELEY Y 2.5 1.0 800.0
6 100007000.0 125.0 WASHINGTON N 2.0 1.0 950.0
7 100008000.0 213.0 tremont Y 1.0 1.0 1187.5
8 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
9 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
df1 = df.dropna(subset=['PID'])
df1
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 1187.5
2 100003000.0 125.0 LEXINGON N 2.5 1.0 850.0
3 100004000.0 201.0 BERKELEY N 1.0 1.0 700.0
5 100006000.0 207.0 BERKELEY Y 2.5 1.0 800.0
6 100007000.0 125.0 WASHINGTON N 2.0 1.0 950.0
7 100008000.0 213.0 tremont Y 1.0 1.0 1187.5
8 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
9 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
df1.ST_NAME.unique()
array(['PUTNAM', 'LEXINGTON', 'LEXINGON', 'BERKELEY', 'WASHINGTON',
       'tremont', 'TREMONT'], dtype=object)
df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.replace("LEXINGON", "LEXINGTON"))
df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.upper())
<ipython-input-27-3a1ae30bd232>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.replace("LEXINGON", "LEXINGTON"))
<ipython-input-27-3a1ae30bd232>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.upper())
df1
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 1187.5
2 100003000.0 125.0 LEXINGTON N 2.5 1.0 850.0
3 100004000.0 201.0 BERKELEY N 1.0 1.0 700.0
5 100006000.0 207.0 BERKELEY Y 2.5 1.0 800.0
6 100007000.0 125.0 WASHINGTON N 2.0 1.0 950.0
7 100008000.0 213.0 TREMONT Y 1.0 1.0 1187.5
8 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
9 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
df1.drop_duplicates(keep='last')
PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0 100001000.0 104.0 PUTNAM Y 3.0 1.0 1000.0
1 100002000.0 197.0 LEXINGTON N 3.0 1.5 1187.5
2 100003000.0 125.0 LEXINGTON N 2.5 1.0 850.0
3 100004000.0 201.0 BERKELEY N 1.0 1.0 700.0
5 100006000.0 207.0 BERKELEY Y 2.5 1.0 800.0
6 100007000.0 125.0 WASHINGTON N 2.0 1.0 950.0
7 100008000.0 213.0 TREMONT Y 1.0 1.0 1187.5
9 100009000.0 215.0 TREMONT Y 2.5 2.0 1800.0
df1['ST_NUM'] = df1['ST_NUM'].astype(str)
<ipython-input-30-09fc6084589a>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ST_NUM'] = df1['ST_NUM'].astype(str)
df1.dtypes
PID             float64
ST_NUM           object
ST_NAME          object
OWN_OCCUPIED     object
NUM_BEDROOMS    float64
NUM_BATH        float64
SQ_FT           float64
dtype: object

Tags: !AMLIndex