我创建了一个很小的数据集,其中存在精确的线性关系。代码如下:
import numpy as np
def gen_data(n, k, seed=5711):
    """Generate a toy dataset with an exact (noise-free) linear relation y = X @ beta.

    Parameters
    ----------
    n : int
        Number of samples (rows).
    k : int
        Number of features (columns of X).
    seed : int, optional
        RNG seed. The default (5711) reproduces the original fixed dataset;
        it was previously hard-coded.

    Returns
    -------
    np.ndarray
        Float32 array of shape (n, k + 1): columns [0, k) hold X, column k
        holds y = X @ beta.
    """
    np.random.seed(seed)
    # Ground-truth coefficients, printed so fitted weights can be compared
    # against them later.
    beta = np.random.uniform(0, 1, size=(k, 1))
    print("beta is:", beta)
    X = np.random.normal(size=(n, k))
    y = X.dot(beta).reshape(-1, 1)
    # Pack features and target side by side into one matrix.
    D = np.concatenate([X, y], axis=1)
    return D.astype(np.float32)
现在我用 PyTorch 搭建了一个带有 SGD 优化器和 MSE 损失的神经网络,学习率为 1e-1,它在 50 个 epoch 内近似收敛到真实值。
我尝试在 TensorFlow 中搭建完全相同的模型:
import keras.layers
from sklearn.model_selection import train_test_split
from keras.models import Sequential
import tensorflow as tf

n = 10
k = 2
X = gen_data(n, k)
D_train, D_test = train_test_split(X, test_size=0.2)
X_train, y_train = D_train[:, :k], D_train[:, k:]
X_test, y_test = D_test[:, :k], D_test[:, k:]

# Single Dense(1) layer == one linear map, same capacity as the PyTorch model.
model = Sequential([keras.layers.Dense(1)])
# BUG FIX: the optimizer keyword is `learning_rate`, not the deprecated `lr`.
# Recent tf.keras versions ignore (or reject) `lr`, so the model silently
# trained with the default learning_rate=0.01 -- 10x smaller than the PyTorch
# run -- which is why it appeared not to converge in 50 epochs and needed
# ~1000 instead.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-1),
    loss=tf.keras.losses.mean_squared_error,
)
# batch_size=64 exceeds the 8 training rows, so every epoch is a single
# full-batch gradient step, matching the PyTorch DataLoader setup.
model.fit(X_train, y_train, batch_size=64, epochs=50)
当我调用 model.get_weights() 时,它显示出与真实值的显著差异,并且损失仍然不接近于零。我不知道为什么这个模型的表现不如 PyTorch 模型。即使不与 PyTorch 模型比较,网络难道不应该在这个小玩具数据集上收敛到真实值吗?我在设置模型时犯了什么错误?
编辑:以下是我完整的 PyTorch 代码,以供比较:
import torch
from torch.utils.data import DataLoader, Dataset, Sampler, SequentialSampler, RandomSampler
from torch import nn
from sklearn.model_selection import train_test_split
# Toy-problem dimensions: n samples, k features.
n = 10
k = 2
# Everything stays on CPU for this tiny example.
device = "cpu"
class Daten(Dataset):
    """Expose a 2-D array as a torch Dataset.

    Every column except the last is treated as a feature; the final
    column is the regression target.
    """

    def __init__(self, df):
        self.df = df
        # Index of the target column; features occupy columns [0, ycol).
        self.ycol = df.shape[1] - 1

    def __getitem__(self, index):
        row = self.df[index]
        return row[:self.ycol], row[self.ycol:]

    def __len__(self):
        return self.df.shape[0]
def split_into(D, batch_size=64, **kwargs):
    """Split D into train and test DataLoaders.

    Extra keyword arguments (e.g. ``test_size``) are forwarded to
    sklearn's ``train_test_split``.
    """
    part_train, part_test = train_test_split(D, **kwargs)
    loaders = [
        DataLoader(Daten(part), batch_size=batch_size)
        for part in (part_train, part_test)
    ]
    return loaders[0], loaders[1]
# Build the exactly-linear toy dataset and split it 80/20 into loaders.
D = gen_data(n, k)
dl_train, dl_test = split_into(D, test_size=0.2)
class NeuralNetwork(nn.Module):
    """A single linear layer mapping ``in_features`` inputs to one output.

    The original version silently read the module-level global ``k``
    inside ``__init__``; the input width is now an explicit constructor
    argument (defaulting to 2, the value of ``k`` in this script) so the
    class no longer depends on a hidden global.
    """

    def __init__(self, in_features: int = 2):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(in_features, 1)
        )

    def forward(self, x):
        # Pure affine map: ypred = x @ W.T + b
        ypred = self.linear(x)
        return ypred
# Instantiate the model on the chosen device and echo its structure.
model = NeuralNetwork().to(device)
print(model)
# MSE loss with plain SGD; lr matches the intended Keras run (1e-1).
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
def train(dataloader, model, loss_fn, optimizer, device="cpu"):
    """Run one training epoch over ``dataloader``, updating ``model`` in place.

    Parameters
    ----------
    dataloader : torch.utils.data.DataLoader
        Yields (X, y) batches; its ``.dataset`` length is used for logging.
    model : torch.nn.Module
        Model being optimized (mutated in place).
    loss_fn : callable
        Maps (pred, target) to a scalar loss tensor.
    optimizer : torch.optim.Optimizer
        Optimizer over ``model``'s parameters.
    device : str, optional
        Device each batch is moved to. Defaults to "cpu"; previously this
        was read from a module-level global.

    Note: a leftover debug ``print(y.shape)`` was removed.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Forward pass and prediction error.
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: accumulate gradients, step, then clear them
        # for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Periodic progress report (every 100th batch; with this tiny
        # dataset only batch 0 fires).
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
# Train for 50 epochs; on this exactly-linear toy dataset the single
# linear layer converges close to the true beta.
epochs = 50
for t in range(epochs):
    print(f"Epoch {t + 1}\n-------------------------------")
    train(dl_train, model, loss_fn, optimizer)
print("Done!")
编辑:
我大幅增加了 epoch 数。epochs=1000 之后就接近真实值了。因此,我对这种差异的最佳猜测是:TF 应用了某种不够理想的初始化?