Auto Byte

Science AI

# 用100元的支票骗到100万：看看对抗性攻击是怎么为非作歹的

Inception v3 分类器

“真”和“假”之间的边界几乎是线性的。我们可以从中得到两个有趣的结论。首先，如果你沿着梯度的方向进行计算，一旦碰到了预测的类别改变的区域，就可以确认攻击成功了。另一方面，它表明决策函数的结构远比大多数研究者想象的简单。

# Third-party imports for the adversarial-attack demo.
# NOTE(review): the scraped source read "mport torch" — the leading "i" was
# lost in extraction; restored here.
# NOTE(review): torch.autograd.gradcheck.zero_gradients was removed in
# modern PyTorch; on recent versions zero gradients via x.grad.zero_() instead.
import torch
from torch import nn
from torch.autograd import Variable
from torch.autograd.gradcheck import zero_gradients
import torchvision.transforms as T
from torchvision.models.inception import inception_v3
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

# Class-index -> human-readable label mapping, plus tensor<->image helpers.
# SECURITY NOTE(review): eval() executes arbitrary code from classes.txt; if
# the file is a plain Python literal (a dict/list), ast.literal_eval is the
# safe drop-in replacement — confirm the file format before switching.
with open('classes.txt') as f:  # context manager: the original leaked the handle
    classes = eval(f.read())

# PIL image -> float tensor in [0, 1] with a leading batch dimension.
trans = T.Compose([T.ToTensor(), T.Lambda(lambda t: t.unsqueeze(0))])


def reverse_trans(x):
    """Convert a (C, H, W) tensor back to an H x W x C numpy image."""
    return np.asarray(T.ToPILImage()(x))

# Attack hyper-parameters.
# L-inf perturbation budget. NOTE(review): the original wrote 225, which is
# almost certainly a typo for the 255 pixel range (images are 8-bit).
eps = 2 * 8 / 255.
steps = 40            # number of signed-gradient steps per attack
norm = float('inf')   # the attack is measured in the L-inf norm
step_alpha = 0.0001   # per-step perturbation magnitude

# Pretrained Inception v3 in inference mode on the GPU.
model = inception_v3(pretrained=True, transform_input=True).cuda()
loss = nn.CrossEntropyLoss()
model.eval()  # dropped the stray notebook-style trailing semicolon

def load_image(img_path):
    """Read an image file and return it as a normalized 4-D tensor."""
    rgb = Image.open(img_path).convert('RGB')
    return trans(rgb)

def get_class(img):
    """Return the human-readable label the model predicts for `img`."""
    inp = Variable(img, volatile=True).cuda()
    scores = model(inp).data
    top1 = scores.max(1)[1].cpu().numpy()[0]
    return classes[top1]

def draw_result(img, noise, adv_img):
    """Show original image, attack noise, and adversarial example side by side."""
    orig_class = get_class(img)
    attack_class = get_class(adv_img)
    fig, axes = plt.subplots(1, 3, figsize=(15, 10))
    panels = [
        (reverse_trans(img[0]), 'Original image: {}'.format(orig_class.split(',')[0])),
        (noise[0].cpu().numpy().transpose(1, 2, 0), 'Attacking noise'),
        (reverse_trans(adv_img[0]), 'Adversarial example: {}'.format(attack_class)),
    ]
    for axis, (image, title) in zip(axes, panels):
        axis.imshow(image)
        axis.set_title(title)
        axis.set_axis_off()
    plt.tight_layout()
    plt.show()

def non_targeted_attack(img):
    # Iterative FGSM-style attack: repeatedly ascend the loss w.r.t. the
    # model's own current prediction so the classification flips to *any*
    # other class.
    img = img.cuda()
    label = torch.zeros(1, 1).cuda()  # placeholder; overwritten each iteration below
    x, y = Variable(img, requires_grad=True), Variable(label)
    for step in range(steps):
        zero_gradients(x)
        out = model(x)
        # Use the model's current top-1 prediction as the class to move away from.
        y.data = out.data.max(1)[1]
        _loss = loss(out, y)
        _loss.backward()
        # Signed-gradient ascent step of fixed size step_alpha.
        normed_grad = step_alpha * torch.sign(x.grad.data)
        step_adv = x.data + normed_grad
        # Project the accumulated perturbation back into the eps L-inf ball...
        adv = step_adv - img
        adv = torch.clamp(adv, -eps, eps)
        # ...and keep the adversarial image a valid image in [0, 1].
        result = img + adv
        result = torch.clamp(result, 0.0, 1.0)
        x.data = result
    # Returns (adversarial image, perturbation), both moved to CPU.
    return result.cpu(), adv.cpu()

# Demo: attack a single input image and visualize the result.
img = load_image('input.png')
adv_img, noise = non_targeted_attack(img)
draw_result(img, noise, adv_img)

def targeted_attack(img, label):
    # Iterative targeted attack: *descend* the loss toward the requested
    # target class index `label`. The only substantive difference from
    # non_targeted_attack is the minus sign on the gradient step.
    img = img.cuda()
    label = torch.Tensor([label]).long().cuda()  # class index -> 1-element LongTensor
    x, y = Variable(img, requires_grad=True), Variable(label)
    for step in range(steps):
        zero_gradients(x)
        out = model(x)
        _loss = loss(out, y)
        _loss.backward()
        normed_grad = step_alpha * torch.sign(x.grad.data)
        # Move *against* the gradient to make the target class more likely.
        step_adv = x.data - normed_grad
        # Clip the accumulated perturbation to the eps L-inf budget...
        adv = step_adv - img
        adv = torch.clamp(adv, -eps, eps)
        # ...and keep pixel values inside the valid [0, 1] range.
        result = img + adv
        result = torch.clamp(result, 0.0, 1.0)
        x.data = result
    # Returns (adversarial image, perturbation), both moved to CPU.
    return result.cpu(), adv.cpu()

`step_adv = x.data - normed_grad`