import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Load the daily training series from a CSV file in the working directory.
df = pd.read_csv("daily_serie_train.csv")

# Preview the first rows of the raw table.
df.head()

# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          1462 non-null   object 
 1   meantemp      1462 non-null   float64
 2   humidity      1462 non-null   float64
 3   wind_speed    1462 non-null   float64
 4   meanpressure  1462 non-null   float64
dtypes: float64(4), object(1)
memory usage: 57.2+ KB

# Count the missing values in each column.
df.isna().sum()

date            0
meantemp        0
humidity        0
wind_speed      0
meanpressure    0
dtype: int64

# `date` is read from the CSV as plain strings; parse it into timestamps and
# make it the index so later cells can filter by date and read `.index.month`.
df.date = pd.DatetimeIndex(df.date.values)
df = df.set_index('date')
df.head()

# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Segundo os boxplots anteriores da EDA, existem outliers em:

humidade
pressão atmosférica
velocidade do vento

def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies within Tukey's fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``. Values lying
    exactly on a fence are kept, so a column whose IQR is zero (for example a
    constant column) is returned unchanged instead of losing every row.

    Args:
        df: DataFrame to filter. It is not modified.
        col: Label of the numeric column used to detect outliers.
        factor: IQR multiplier that sets the fence distance (default 1.5).

    Returns:
        A DataFrame with the rows of ``df`` that fall inside the fences, with
        their original index labels. Rows where ``col`` is NaN are dropped.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # `between` includes both bounds by default and is False for NaN.
    return df[df[col].between(lower, upper)]

# Filter outliers one column at a time; each pass works on the rows kept by
# the previous pass.
# NOTE(review): the note above also lists humidity as having outliers, but it
# is not filtered here -- confirm this is intentional.
for col in ["wind_speed","meanpressure"]:
  df=remove_outliers(df, col)

# Re-plot pressure to inspect the distribution after filtering.
df[['meanpressure']].boxplot()
plt.show()

# Input columns for the model, and the column to predict.
selected_features = ['humidity', 'wind_speed']
target = ['meantemp']
# Keep only those columns, features first and target last (meanpressure is dropped).
df = df[selected_features + target]

# Chronological split: rows dated before 2016-08-01 form the training set and
# the remaining rows form the test set.
# `.copy()` makes each split an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
train = df[df.index<"2016-08-01"].copy()
test = df[df.index>="2016-08-01"].copy()

# Add the calendar month of each row. `assign` returns a new frame, which
# avoids the SettingWithCopyWarning raised when writing a column into a slice.
train = train.assign(month=train.index.month)

# Mean training temperature per calendar month.
# `join(..., on='month')` matches the caller's `month` values against the
# *index* of the other frame. Straight after `reset_index()` that index is
# 0..n-1, which attaches the following month's mean to every row and leaves
# December empty, so the index is set to the month numbers themselves.
monthly_mean = train.groupby('month').agg({'meantemp':'mean'}).reset_index()
monthly_mean.index = monthly_mean['month'].to_numpy()

# `monthly_mean` keeps its `month` column so the join still yields the
# `month_month` and `meantemp_month` columns in the same order as before;
# a later cell drops `month_month` by name.
train = train.join(monthly_mean, on='month', rsuffix='_month')

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# Add the calendar month of each row. `assign` returns a new frame, which
# avoids the SettingWithCopyWarning raised when writing a column into a slice.
test = test.assign(month=test.index.month)

# Monthly means are computed from the training split only, so no test-set
# temperatures feed the feature.
# `join(..., on='month')` matches the caller's `month` values against the
# *index* of the other frame. Straight after `reset_index()` that index is
# 0..n-1, which attaches the following month's mean to every row and leaves
# December empty, so the index is set to the month numbers themselves.
monthly_mean = train.groupby('month').agg({'meantemp':'mean'}).reset_index()
monthly_mean.index = monthly_mean['month'].to_numpy()

# `monthly_mean` keeps its `month` column so the join still yields the
# `month_month` and `meantemp_month` columns in the same order as before;
# a later cell drops `month_month` by name.
test = test.join(monthly_mean, on='month', rsuffix='_month')

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# The month key and the duplicate created by the join suffix were only needed
# to attach `meantemp_month`; remove them from both splits.
train = train.drop(columns=['month', 'month_month'])
test = test.drop(columns=['month', 'month_month'])

from sklearn.preprocessing import MinMaxScaler

# A single scaler instance is shared by both splits.
min_max = MinMaxScaler()

# Fit the scaler on the training split and scale it. Every column is scaled,
# including the target `meantemp`.
# NOTE(review): rebuilding the frame without `index=` replaces the date index
# with a 0..n-1 range -- confirm later cells do not need the dates.
train = pd.DataFrame(data = min_max.fit_transform(train), columns=train.columns)

# Scale the test split with the scaler already fitted on the training split.
test = pd.DataFrame(data =min_max.transform(test), columns=test.columns)

train.head()

def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies within Tukey's fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``. Values lying
    exactly on a fence are kept, so a column whose IQR is zero (for example a
    constant column) is returned unchanged instead of losing every row.

    Args:
        df: DataFrame to filter. It is not modified.
        col: Label of the numeric column used to detect outliers.
        factor: IQR multiplier that sets the fence distance (default 1.5).

    Returns:
        A DataFrame with the rows of ``df`` that fall inside the fences, with
        their original index labels. Rows where ``col`` is NaN are dropped.
    """
    # Quantiles are fractions written with a decimal point: 0.25 and 0.75.
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # Boolean indexing builds a new frame, so the filtered result must be the
    # value returned. `between` includes both bounds and is False for NaN.
    return df[df[col].between(lower, upper)]

	date	meantemp	humidity	wind_speed	meanpressure
0	2013-01-01	10.000000	84.500000	0.000000	1015.666667
1	2013-01-02	7.400000	92.000000	2.980000	1017.800000
2	2013-01-03	7.166667	87.000000	4.633333	1018.666667
3	2013-01-04	8.666667	71.333333	1.233333	1017.166667
4	2013-01-05	6.000000	86.833333	3.700000	1016.500000

	meantemp	humidity	wind_speed	meanpressure
date
2013-01-01	10.000000	84.500000	0.000000	1015.666667
2013-01-02	7.400000	92.000000	2.980000	1017.800000
2013-01-03	7.166667	87.000000	4.633333	1018.666667
2013-01-04	8.666667	71.333333	1.233333	1017.166667
2013-01-05	6.000000	86.833333	3.700000	1016.500000

	meantemp	humidity	wind_speed	meanpressure
count	1462.000000	1462.000000	1462.000000	1462.000000
mean	25.495521	60.771702	6.802209	1011.104548
std	7.348103	16.769652	4.561602	180.231668
min	6.000000	13.428571	0.000000	-3.041667
25%	18.857143	50.375000	3.475000	1001.580357
50%	27.714286	62.625000	6.221667	1008.563492
75%	31.305804	72.218750	9.238235	1014.944901
max	38.714286	100.000000	42.220000	7679.333333

	humidity	wind_speed	meantemp	meantemp_month
0	0.840372	0.000000	0.122271	0.140201
1	0.929054	0.173888	0.042795	0.140201
2	0.869932	0.270362	0.035662	0.140201
3	0.684685	0.071967	0.081514	0.140201
4	0.867962	0.215901	0.000000	0.140201