#Let's get rid of some imports %matplotlib inline import matplotlib.pyplot as plt import numpy as np #Define the model import torch import torch.nn as nn import torch.nn.functional as F
def read_text(path="boston_housing.txt"):
    """Parse the Boston housing text file into a pandas DataFrame.

    The raw file keeps the 14 column names (with descriptions) on lines
    8-21 and the observations from line 23 on, with every record wrapped
    across two physical lines.

    Parameters
    ----------
    path : str, optional
        Location of the raw file (default: "boston_housing.txt",
        preserving the original behavior).

    Returns
    -------
    pandas.DataFrame
        One row per observation, one column per attribute.
    """
    # Local imports: the notebook's top cell never imported these names.
    import pandas as pd
    from io import StringIO

    with open(path) as f:
        rows = [line.replace("\n", "") for line in f.readlines()]

    # Lines 8-21 are "NAME description" pairs; keep only the names.
    names = [s.split()[0].strip() for s in rows[7:21]]

    # Each record spans two physical lines; stitch consecutive pairs.
    data = rows[22:]
    data = [data[i] + data[i + 1] for i in range(0, len(data), 2)]

    # r"\s+": raw string avoids the invalid-escape DeprecationWarning.
    df = pd.read_csv(
        StringIO("\n".join(data)),
        sep=r"\s+",
        header=None,
        names=names,
    )
    # print(df.head())
    return df
# Load the dataset and preview the first few rows.
boston_df = read_text()
boston_df.head()
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
MEDV
0
0.00632
18.0
2.31
0
0.538
6.575
65.2
4.0900
1
296.0
15.3
396.90
4.98
24.0
1
0.02731
0.0
7.07
0
0.469
6.421
78.9
4.9671
2
242.0
17.8
396.90
9.14
21.6
2
0.02729
0.0
7.07
0
0.469
7.185
61.1
4.9671
2
242.0
17.8
392.83
4.03
34.7
3
0.03237
0.0
2.18
0
0.458
6.998
45.8
6.0622
3
222.0
18.7
394.63
2.94
33.4
4
0.06905
0.0
2.18
0
0.458
7.147
54.2
6.0622
3
222.0
18.7
396.90
5.33
36.2
属性意义:
CHAS 和 RAD 应该是离散量
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
接下来检查数据特征和质量
1
# Summary statistics (count/mean/std/quantiles) for every column.
boston_df.describe()
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
MEDV
count
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
mean
3.613524
11.363636
11.136779
0.069170
0.554695
6.284634
68.574901
3.795043
9.549407
408.237154
18.455534
356.674032
12.653063
22.532806
std
8.601545
23.322453
6.860353
0.253994
0.115878
0.702617
28.148861
2.105710
8.707259
168.537116
2.164946
91.294864
7.141062
9.197104
min
0.006320
0.000000
0.460000
0.000000
0.385000
3.561000
2.900000
1.129600
1.000000
187.000000
12.600000
0.320000
1.730000
5.000000
25%
0.082045
0.000000
5.190000
0.000000
0.449000
5.885500
45.025000
2.100175
4.000000
279.000000
17.400000
375.377500
6.950000
17.025000
50%
0.256510
0.000000
9.690000
0.000000
0.538000
6.208500
77.500000
3.207450
5.000000
330.000000
19.050000
391.440000
11.360000
21.200000
75%
3.677082
12.500000
18.100000
0.000000
0.624000
6.623500
94.075000
5.188425
24.000000
666.000000
20.200000
396.225000
16.955000
25.000000
max
88.976200
100.000000
27.740000
1.000000
0.871000
8.780000
100.000000
12.126500
24.000000
711.000000
22.000000
396.900000
37.970000
50.000000
1 2 3
import numpy as np

# Count missing values per column: isnan gives a boolean frame,
# summing it column-wise yields the NaN count for each attribute.
print(np.sum(np.isnan(boston_df)))
CRIM 0
ZN 0
INDUS 0
CHAS 0
NOX 0
RM 0
AGE 0
DIS 0
RAD 0
TAX 0
PTRATIO 0
B 0
LSTAT 0
MEDV 0
dtype: int64
分析数据
首先,让我们关注因变量,大部分情况下存在正态分布,其中一些位于分布的顶端,我们稍后将进行探讨。
然后关注数据集中的相关系数分布
1 2 3 4
# Seaborn makes prettier distribution plots than raw matplotlib.
# See: http://seaborn.pydata.org/tutorial/distributions.html
import seaborn as sns

sns.displot(boston_df['MEDV']);
def get_tag_columns(df, limit=10):
    """Return the columns that look categorical (discrete "tag" columns).

    A column qualifies when it has fewer than `limit` distinct values.
    NOTE: the original docstring claimed these were the *continuous*
    columns — it is the opposite: low cardinality means discrete.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    limit : int, optional
        Strict upper bound on the number of unique values (default 10).

    Returns
    -------
    list of column labels, in the frame's column order.
    """
    return [col for col in df.columns if df[col].nunique() < limit]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Standard ss = StandardScaler() X = np.concatenate([ss.fit_transform(one_hot_df), df.drop(tags, axis=1).values], axis=1) return X.astype("float32"), y.astype("float32")
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# This will throw an error at import time if sklearn hasn't been upgraded:
# the old location was sklearn.cross_validation.
from sklearn.model_selection import train_test_split

# y is the dependent variable (MEDV); X holds the scaled/encoded features.
X, y = preprocess(boston_df)

# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Data-derived hyperparameters.
batch_no = len(X_train) // batch_size  # mini-batches per epoch
cols = X_train.shape[1]                # number of input features
n_output = 1                           # single regression target
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#Create the model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assume that we are on a CUDA machine, then this should print a CUDA device:
print("Executing the model on :", device)


class Net(torch.nn.Module):
    """One-hidden-layer MLP for regression (linear output)."""

    def __init__(self, n_feature, size_hidden, n_output):
        super(Net, self).__init__()
        # BUG FIX: the original read the global `cols` here instead of the
        # `n_feature` argument, silently ignoring the parameter.
        self.hidden = torch.nn.Linear(n_feature, size_hidden)   # hidden layer
        self.predict = torch.nn.Linear(size_hidden, n_output)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))  # activation function for hidden layer
        x = self.predict(x)         # linear output (no activation: regression)
        return x


net = Net(cols, size_hidden, n_output)
Executing the model on : cuda:0
1 2 3 4
# Adam is an adaptive flavor of gradient descent that typically
# converges better than plain SGD here.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
# optimizer = torch.optim.SGD(net.parameters(), lr=0.2)

# Summed (not averaged) squared error — regression loss.
criterion = torch.nn.MSELoss(reduction='sum')
import pandas as pd from sklearn.metrics import r2_score
running_loss = 0.0 for epoch in range(num_epochs): #Shuffle just mixes up the dataset between epocs # X_train, y_train = shuffle(X_train, y_train) # Mini batch learning net.train() for inputs,labels in train_loader: # start = i * batch_size # end = start + batch_size # inputs = Variable(torch.FloatTensor(X_train[start:end])) # labels = Variable(torch.FloatTensor(y_train[start:end])) # zero the parameter gradients optimizer.zero_grad()