Notebook for data preprocessing
# Importing libraries
import pandas as pd
import numpy as np
# Read csv file into a pandas dataframe
df = pd.read_csv("property_data.csv")
# Take a look at the first few rows
df
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3 |
1.0 |
1000 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3 |
1.5 |
-- |
2 |
100003000.0 |
NaN |
LEXINGON |
N |
NaN |
1.0 |
850 |
3 |
100004000.0 |
201.0 |
BERKELEY |
12 |
1 |
NaN |
700 |
4 |
NaN |
203.0 |
BERKELEY |
Y |
3 |
2.0 |
1600 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
NaN |
1.0 |
800 |
6 |
100007000.0 |
NaN |
WASHINGTON |
NaN |
2 |
NaN |
950 |
7 |
100008000.0 |
213.0 |
tremont |
Y |
1 |
1.0 |
NaN |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
na |
2.0 |
1800 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
na |
2.0 |
1800 |
# Looking at the ST_NUM column
df['ST_NUM'].isnull()
0 False
1 False
2 True
3 False
4 False
5 False
6 True
7 False
8 False
9 False
Name: ST_NUM, dtype: bool
df['ST_NUM'].isnull().sum()
# Looking at the NUM_BEDROOMS column
df['NUM_BEDROOMS'].isnull().sum()
# Making a list of missing value types
missing_values = ["n/a", "na", "--"]
df = pd.read_csv("property_data.csv", na_values = missing_values)
# Looking at the NUM_BEDROOMS column
df['NUM_BEDROOMS'].isnull().sum()
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
NaN |
2 |
100003000.0 |
NaN |
LEXINGON |
N |
NaN |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
12 |
1.0 |
NaN |
700.0 |
4 |
NaN |
203.0 |
BERKELEY |
Y |
3.0 |
2.0 |
1600.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
NaN |
1.0 |
800.0 |
6 |
100007000.0 |
NaN |
WASHINGTON |
NaN |
2.0 |
NaN |
950.0 |
7 |
100008000.0 |
213.0 |
tremont |
Y |
1.0 |
1.0 |
NaN |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
NaN |
2.0 |
1800.0 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
NaN |
2.0 |
1800.0 |
df['OWN_OCCUPIED'].isnull()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
7 False
8 False
9 False
Name: OWN_OCCUPIED, dtype: bool
# Detecting numbers
cnt=0
for row in df['OWN_OCCUPIED']:
try:
int(row)
df.loc[cnt, 'OWN_OCCUPIED']=np.nan
except ValueError:
pass
cnt+=1
df['OWN_OCCUPIED'].isnull()
0 False
1 False
2 False
3 True
4 False
5 False
6 True
7 False
8 False
9 False
Name: OWN_OCCUPIED, dtype: bool
PID 1
ST_NUM 2
ST_NAME 0
OWN_OCCUPIED 2
NUM_BEDROOMS 4
NUM_BATH 2
SQ_FT 2
dtype: int64
# Total number of missing value
df.isnull().sum().sum()
# Replace missing values with a number
df['ST_NUM'].fillna(125, inplace=True)
# Location based replacement
df.loc[2,'ST_NUM'] = 125
# Replace using median
median = df['NUM_BEDROOMS'].median()
df['NUM_BEDROOMS'].fillna(median, inplace=True)
PID 1
ST_NUM 0
ST_NAME 0
OWN_OCCUPIED 2
NUM_BEDROOMS 0
NUM_BATH 2
SQ_FT 2
dtype: int64
# Replace using mode
df['NUM_BATH'].fillna(df['NUM_BATH'].mode()[0], inplace=True)
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
NaN |
2 |
100003000.0 |
125.0 |
LEXINGON |
N |
2.5 |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
NaN |
1.0 |
1.0 |
700.0 |
4 |
NaN |
203.0 |
BERKELEY |
Y |
3.0 |
2.0 |
1600.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
2.5 |
1.0 |
800.0 |
6 |
100007000.0 |
125.0 |
WASHINGTON |
NaN |
2.0 |
1.0 |
950.0 |
7 |
100008000.0 |
213.0 |
tremont |
Y |
1.0 |
1.0 |
NaN |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
PID float64
ST_NUM float64
ST_NAME object
OWN_OCCUPIED object
NUM_BEDROOMS float64
NUM_BATH float64
SQ_FT float64
dtype: object
df['OWN_OCCUPIED'].fillna('N', inplace=True)
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
NaN |
2 |
100003000.0 |
125.0 |
LEXINGON |
N |
2.5 |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
N |
1.0 |
1.0 |
700.0 |
4 |
NaN |
203.0 |
BERKELEY |
Y |
3.0 |
2.0 |
1600.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
2.5 |
1.0 |
800.0 |
6 |
100007000.0 |
125.0 |
WASHINGTON |
N |
2.0 |
1.0 |
950.0 |
7 |
100008000.0 |
213.0 |
tremont |
Y |
1.0 |
1.0 |
NaN |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
# Replace using mean
mean = df['SQ_FT'].mean()
df['SQ_FT'].fillna(mean, inplace=True)
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
1187.5 |
2 |
100003000.0 |
125.0 |
LEXINGON |
N |
2.5 |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
N |
1.0 |
1.0 |
700.0 |
4 |
NaN |
203.0 |
BERKELEY |
Y |
3.0 |
2.0 |
1600.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
2.5 |
1.0 |
800.0 |
6 |
100007000.0 |
125.0 |
WASHINGTON |
N |
2.0 |
1.0 |
950.0 |
7 |
100008000.0 |
213.0 |
tremont |
Y |
1.0 |
1.0 |
1187.5 |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
df1 = df.dropna(subset=['PID'])
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
1187.5 |
2 |
100003000.0 |
125.0 |
LEXINGON |
N |
2.5 |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
N |
1.0 |
1.0 |
700.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
2.5 |
1.0 |
800.0 |
6 |
100007000.0 |
125.0 |
WASHINGTON |
N |
2.0 |
1.0 |
950.0 |
7 |
100008000.0 |
213.0 |
tremont |
Y |
1.0 |
1.0 |
1187.5 |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
array(['PUTNAM', 'LEXINGTON', 'LEXINGON', 'BERKELEY', 'WASHINGTON',
'tremont', 'TREMONT'], dtype=object)
df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.replace("LEXINGON", "LEXINGTON"))
df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.upper())
<ipython-input-27-3a1ae30bd232>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.replace("LEXINGON", "LEXINGTON"))
<ipython-input-27-3a1ae30bd232>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1["ST_NAME"] = df1["ST_NAME"].apply(lambda x: x.upper())
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
1187.5 |
2 |
100003000.0 |
125.0 |
LEXINGTON |
N |
2.5 |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
N |
1.0 |
1.0 |
700.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
2.5 |
1.0 |
800.0 |
6 |
100007000.0 |
125.0 |
WASHINGTON |
N |
2.0 |
1.0 |
950.0 |
7 |
100008000.0 |
213.0 |
TREMONT |
Y |
1.0 |
1.0 |
1187.5 |
8 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
df1.drop_duplicates(keep='last')
|
PID |
ST_NUM |
ST_NAME |
OWN_OCCUPIED |
NUM_BEDROOMS |
NUM_BATH |
SQ_FT |
0 |
100001000.0 |
104.0 |
PUTNAM |
Y |
3.0 |
1.0 |
1000.0 |
1 |
100002000.0 |
197.0 |
LEXINGTON |
N |
3.0 |
1.5 |
1187.5 |
2 |
100003000.0 |
125.0 |
LEXINGTON |
N |
2.5 |
1.0 |
850.0 |
3 |
100004000.0 |
201.0 |
BERKELEY |
N |
1.0 |
1.0 |
700.0 |
5 |
100006000.0 |
207.0 |
BERKELEY |
Y |
2.5 |
1.0 |
800.0 |
6 |
100007000.0 |
125.0 |
WASHINGTON |
N |
2.0 |
1.0 |
950.0 |
7 |
100008000.0 |
213.0 |
TREMONT |
Y |
1.0 |
1.0 |
1187.5 |
9 |
100009000.0 |
215.0 |
TREMONT |
Y |
2.5 |
2.0 |
1800.0 |
df1['ST_NUM'] = df1['ST_NUM'].astype(str)
<ipython-input-30-09fc6084589a>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1['ST_NUM'] = df1['ST_NUM'].astype(str)
PID float64
ST_NUM object
ST_NAME object
OWN_OCCUPIED object
NUM_BEDROOMS float64
NUM_BATH float64
SQ_FT float64
dtype: object
Tags: !AMLIndex