FE Climate Change
Como remover outliers de séries temporais
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Load the daily series and take a first look at its structure and missing values.
df = pd.read_csv("daily_serie_train.csv")
df.head()
df.info()
df.isna().sum()

# Parse the date column and use it as the index of the frame.
df["date"] = pd.DatetimeIndex(df["date"].values)
df = df.set_index("date")
df.head()
df.describe()
Existem outliers, segundo os boxplots anteriores do EDA, em:
- humidade
- pressão atmosférica
- velocidade do vento
def remove_outliers(df: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of *df* whose *col* value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``.

    Args:
        df: Frame to filter; it is not modified.
        col: Name of the column the fences are computed from.
        factor: Multiplier applied to the IQR (1.5 by default).

    Returns:
        A new frame with the rows outside the fences removed. Rows whose
        *col* value is NaN are removed as well, because NaN never compares
        as being inside an interval.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # Inclusive bounds: a value sitting exactly on a fence is not an outlier,
    # and a column with IQR == 0 (e.g. constant) must not lose every row.
    return df[df[col].between(lower, upper)]
# Filter the columns one after another; each pass works on the frame left by
# the previous pass.
# NOTE(review): the notes above also list humidity as having outliers, but it
# is not filtered here - confirm this is intentional.
for col in ("wind_speed", "meanpressure"):
    df = remove_outliers(df, col)

# Box plot of the pressure column after filtering.
df.boxplot(column=['meanpressure'])
plt.show()
selected_features = ['humidity', 'wind_speed']
target = ['meantemp']
df = df[selected_features + target]

# Chronological split. The .copy() calls make train/test independent frames,
# so adding a column below does not write into a slice of df.
train = df[df.index < "2016-08-01"].copy()
test = df[df.index >= "2016-08-01"].copy()

# Mean temperature per calendar month, computed from the training period only
# and indexed by the month number itself.
monthly_mean = train.groupby(train.index.month)['meantemp'].mean()

# Look the value up by month number. Joining with on='month' against a frame
# that went through reset_index() matches on that frame's 0..N-1 positional
# index instead, which shifts every row to the following month's mean and
# leaves December as NaN.
# Months that do not occur in train yield NaN.
train['meantemp_month'] = monthly_mean.reindex(train.index.month).to_numpy()
test['meantemp_month'] = monthly_mean.reindex(test.index.month).to_numpy()
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training frame only, then apply the same scaling to
# the test frame.
min_max = MinMaxScaler()
scaled_train = min_max.fit_transform(train)
scaled_test = min_max.transform(test)

# Rebuild the frames from the scaled arrays; only the column names are passed
# on, so both frames get a fresh default index.
train = pd.DataFrame(data=scaled_train, columns=train.columns)
test = pd.DataFrame(data=scaled_test, columns=test.columns)
train.head()
def remove_outliers(df: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of *df* whose *col* value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``.

    Args:
        df: Frame to filter; it is not modified.
        col: Name of the column the fences are computed from.
        factor: Multiplier applied to the IQR (1.5 by default).

    Returns:
        A new frame containing only the rows inside the fences (bounds
        included). Rows whose *col* value is NaN are not kept.
    """
    # The fractions need a decimal point: quantile(0, 25) passes 25 as the
    # second positional argument instead of asking for the 25th percentile.
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    inside = df[col].between(q1 - factor * spread, q3 + factor * spread)
    # Return the filtered frame; filtering without using the result would hand
    # the input back unchanged.
    return df[inside]