arXiv_Computer_Vision_and_Pattern_Recognition_2016
/
AdaptiveAttention
mirror of https://github.com/jiasenlu/AdaptiveAttention

 
			
							require 'torch'
require 'nn'
require 'nngraph'
-- local imports
require 'visu.DataLoaderResNetEval'
local utils = require 'misc.utils'
require 'visu.LanguageModel_visu'
local net_utils = require 'misc.net_utils'
require 'misc.optim_updates'
require 'gnuplot'
require 'xlua'
-------------------------------------------------------------------------------
-- Input arguments and options
-------------------------------------------------------------------------------
-- Command line parser for this script. Kept file-local so it does not leak
-- into the global environment.
local cmd = torch.CmdLine()
cmd:text()
cmd:text('Visualize the attention of an Image Captioning model on ground-truth captions')
cmd:text()
cmd:text('Options')


-- Model settings
-- Alternative configuration (flickr30k), kept for reference:
--[[
cmd:option('-dataset','flickr30k','')
cmd:option('-input_h5','/data/flickr30k/cocotalk.h5','path to the h5file containing the preprocessed dataset')
cmd:option('-input_json','/data/flickr30k/cocotalk.json','path to the json file containing additional info and vocab')
cmd:option('-cnn_model','../image_model/resnet-152.t7','path to CNN model file containing the weights, Caffe format. Note this MUST be a VGGNet-16 right now.')
]]--

-- The script reads opt.dataset when it runs the language evaluation, so the
-- option has to be declared; 'coco' matches the default data paths below.
cmd:option('-dataset','coco','name of the dataset, passed to the language evaluation (only used when -language_eval is 1)')
cmd:option('-input_h5','/data/coco/cocotalk.h5','path to the h5file containing the preprocessed dataset')
cmd:option('-input_json','/data/coco/cocotalk.json','path to the json file containing additional info and vocab')
cmd:option('-cnn_model','../image_model/resnet-152.t7','path to the Torch (.t7) CNN model file; only loaded when -start_from is empty')
cmd:option('-checkpoint_path', 'save/coco_val_1', 'folder containing the checkpoint named by -start_from')

-- Alternative configuration (coco test split), kept for reference:
--[[
cmd:option('-input_h5','/data/coco/cocotalk_test.h5','path to the h5file containing the preprocessed dataset')
cmd:option('-input_json','/data/coco/cocotalk_test.json','path to the json file containing additional info and vocab')
cmd:option('-input_vocab_json','/data/coco/cocotalk.json','path to the json file containing additional info and vocab')
cmd:option('-cnn_model','../image_model/resnet-152.t7','path to CNN model file containing the weights, Caffe format. Note this MUST be a VGGNet-16 right now.')
]]--
cmd:option('-start_from', 'model_id1_36.t7', 'path to a model checkpoint to initialize model weights from. Empty = don\'t')
cmd:option('-beam_size', 3, 'beam size to use when sampling captions')
--cmd:option('-checkpoint_path', 'save/flickr30k_512x1_1', 'folder to save checkpoints into (empty = this folder)')

cmd:option('-drop_prob_lm', 0.5, 'strength of dropout in the Language Model RNN')
cmd:option('-rnn_size',512,'size of the rnn in number of hidden nodes in each layer')
cmd:option('-num_layers',1,'number of layers in the rnn')
cmd:option('-input_encoding_size',512,'the encoding size of each token in the vocabulary, and the image.')
cmd:option('-batch_size',10,'what is the batch size in number of images per batch? (there will be x seq_per_img sentences)')

cmd:option('-fc_size',2048,'size of the CNN fc feature passed to the language model')
cmd:option('-conv_size',2048,'size of the CNN conv feature passed to the language model')
cmd:option('-seq_per_img',5,'number of captions to sample for each image during training. Done for efficiency since CNN forward pass is expensive. E.g. coco has 5 sents/image')

cmd:option('-val_images_use', -1, 'how many images to use when periodically evaluating the validation loss? (-1 = all)')
cmd:option('-save_checkpoint_every', 3, 'how often to save a model checkpoint?')
cmd:option('-language_eval', 0, 'Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.')

-- misc
cmd:option('-backend', 'cudnn', 'nn|cudnn')
cmd:option('-id', '1', 'an id identifying this run/job. used in cross-val and appended when writing progress files')
cmd:option('-seed', 123, 'random number generator seed to use')
cmd:option('-gpuid', 0, 'which gpu to use. -1 = use CPU')

cmd:text()

-------------------------------------------------------------------------------
-- Basic Torch initializations
-------------------------------------------------------------------------------
-- Parse the script arguments into the option table used by the rest of the file.
local opt = cmd:parse(arg)
--torch.manualSeed(opt.seed)
-- CPU tensors default to single precision.
torch.setdefaulttensortype('torch.FloatTensor')

-- Load the CUDA packages only when a GPU was requested.
-- NOTE(review): later parts of this script call :cuda() unconditionally, so
-- -gpuid -1 is probably not usable as-is -- confirm before relying on it.
if opt.gpuid >= 0 then
  require 'cutorch'
  require 'cunn'
  if opt.backend == 'cudnn' then
    require 'cudnn'
  end
  --cutorch.manualSeed(opt.seed)
  -- -gpuid is 0-based on the command line; cutorch devices are 1-based.
  local device_index = opt.gpuid + 1
  cutorch.setDevice(device_index)
end

-------------------------------------------------------------------------------
-- Create the Data Loader instance
-------------------------------------------------------------------------------
-- Build the data loader from the parsed options.
-- NOTE(review): no cmd:option for -nn_neighbor or -thread_num is visible in
-- this script, so both fields are presumably nil here -- confirm that the
-- loader has its own defaults for them.
local loader
do
  local loader_args = {
    h5_file = opt.input_h5,
    json_file = opt.input_json,
    neighbor_h5 = opt.nn_neighbor,
    batch_size = opt.batch_size,
    seq_per_img = opt.seq_per_img,
    thread_num = opt.thread_num,
  }
  loader = DataLoader(loader_args)
end
-- Variant that also passes a separate vocabulary file:
--local loader = DataLoader{h5_file = opt.input_h5, json_file = opt.input_json, vocab_json_file = opt.input_vocab_json,neighbor_h5 = opt.nn_neighbor, 
--                  batch_size = opt.batch_size, seq_per_img = opt.seq_per_img, thread_num = opt.thread_num}

-------------------------------------------------------------------------------
-- Initialize the networks
-------------------------------------------------------------------------------
-- create protos from scratch
-- initialize language model
-- Options handed to the language model constructor. The two loader queries
-- run in the same order as before (vocab size, then sequence length).
local lmOpt = {
  vocab_size = loader:getVocabSize(),
  input_encoding_size = opt.input_encoding_size,
  rnn_size = opt.rnn_size,
  num_layers = opt.num_layers,
  dropout = opt.drop_prob_lm,
  seq_length = loader:getSeqLength(),
  -- one language model row per caption, not per image
  batch_size = opt.batch_size * opt.seq_per_img,
  fc_size = opt.fc_size,
  conv_size = opt.conv_size,
}

-- Load the checkpoint (if any) before building the networks, because the CNN
-- parts are taken from it.
local loaded_checkpoint
if opt.start_from ~= '' then
  local loaded_checkpoint_path = path.join(opt.checkpoint_path, opt.start_from)
  print(loaded_checkpoint_path)
  loaded_checkpoint = torch.load(loaded_checkpoint_path)
end

-- All networks used by the evaluation live in this table.
local protos = {}

protos.lm = nn.LanguageModel(lmOpt):cuda()
-- initialize the ConvNet
if opt.start_from ~= '' then
  -- reuse the CNN parts stored in the checkpoint
  protos.cnn_conv_fix = loaded_checkpoint.protos.cnn_conv_fix:cuda()
  protos.cnn_conv = loaded_checkpoint.protos.cnn_conv:cuda()
  protos.cnn_fc = loaded_checkpoint.protos.cnn_fc:cuda()
else
  local cnn_raw = torch.load(opt.cnn_model)
  -- This name used to be an undefined global, so the builders always received
  -- backend = nil; bind it to the -backend option instead.
  local cnn_backend = opt.backend

  -- NOTE(review): no cmd:option for -finetune_start_layer is visible in this
  -- script, so start_layer_num is presumably nil here -- confirm the builders
  -- have a default for it.
  protos.cnn_conv_fix = net_utils.build_residual_cnn_conv_fix(cnn_raw,
                      {backend = cnn_backend, start_layer_num = opt.finetune_start_layer}):cuda()

  protos.cnn_conv = net_utils.build_residual_cnn_conv(cnn_raw,
                      {backend = cnn_backend, start_layer_num = opt.finetune_start_layer}):cuda()

  protos.cnn_fc = net_utils.build_residual_cnn_fc(cnn_raw,
                      {backend = cnn_backend}):cuda()
end
protos.expanderConv = nn.FeatExpanderConv(opt.seq_per_img):cuda()
protos.expanderFC = nn.FeatExpander(opt.seq_per_img):cuda()
protos.transform_cnn_conv = net_utils.transform_cnn_conv(opt.conv_size):cuda()
-- criterion for the language model
protos.crit = nn.LanguageModelCriterion():cuda()

-- Flattened parameter views. Declared local: they were accidental globals.
local params, grad_params = protos.lm:getParameters()
local cnn1_params, cnn1_grad_params = protos.cnn_conv:getParameters()

print('total number of parameters in LM: ', params:nElement())
print('total number of parameters in CNN_conv: ', cnn1_params:nElement())

assert(params:nElement() == grad_params:nElement())
assert(cnn1_params:nElement() == cnn1_grad_params:nElement())

if opt.start_from ~= '' then
  -- restore the language model weights saved in the checkpoint
  params:copy(loaded_checkpoint.lmparam)
end

-- create clones and ensure parameter sharing. we have to do this
-- all the way here at the end because calls such as :cuda() and
-- :getParameters() reshuffle memory around.
protos.lm:createClones()

collectgarbage() -- "yeah, sure why not"
-------------------------------------------------------------------------------
-- Evaluation fun(ction)
-------------------------------------------------------------------------------
--- Runs the model over one data split, feeding the ground-truth captions to
-- the language model, and collects the attention output for every caption.
-- @param split    name of the split, passed through to the data loader
-- @param evalopt  option table; `val_images_use` is the number of images to
--                 evaluate (-1, the default, means every batch of the split)
-- @return predictions   list of {image_id, caption, prob} entries, one per caption
-- @return lang_stats    result of net_utils.language_eval, or nil when
--                       opt.language_eval is not 1
-- @return atten_out_all FloatTensor holding the attention output of every batch
local function evaluate_split(split, evalopt)
  local val_images_use = utils.getopt(evalopt, 'val_images_use', -1)
  -- Captions per image; this used to be a literal 5 in four places, which
  -- only worked for the default -seq_per_img.
  local seq_per_img = opt.seq_per_img
  local caps_per_batch = opt.batch_size * seq_per_img
  -- NOTE(review): size of the last attention dimension is hard-coded --
  -- presumably the number of attended slots; confirm against the language model.
  local atten_width = 50

  print('=> evaluating ...')

  -- setting to the evaluation mode, use only the first gpu
  protos.cnn_conv:evaluate()
  protos.cnn_fc:evaluate()
  protos.lm:evaluate()
  protos.cnn_conv_fix:evaluate()

  local predictions = {}
  local vocab = loader:getVocab()

  -- number of batches to run
  local nbatch
  if val_images_use == -1 then
    nbatch = loader:getnBatch(split)
  else
    nbatch = math.ceil(val_images_use / opt.batch_size)
  end

  loader:init_rand(split)
  loader:reset_iterator(split)

  local atten_out_all = torch.FloatTensor(loader:getSeqLength() + 1, nbatch * caps_per_batch, atten_width):zero()
  for n = 1, nbatch do
    local data = loader:run({split = split, size_image_use = val_images_use})
    xlua.progress(n, nbatch)

    -- convert the data to cuda
    data.images = data.images:cuda()
    data.labels = data.labels:cuda()

    -- forward the CNN, then expand the features to one row per caption
    local feats_conv_fix = protos.cnn_conv_fix:forward(data.images)

    local feats_conv = protos.cnn_conv:forward(feats_conv_fix)
    local feat_conv_t = protos.transform_cnn_conv:forward(feats_conv)
    local feats_fc = protos.cnn_fc:forward(feats_conv)

    local expanded_feats_conv = protos.expanderConv:forward(feat_conv_t)
    local expanded_feats_fc = protos.expanderFC:forward(feats_fc)

    -- forward the language model on the ground-truth labels; only the second
    -- output (attention) is used here
    local _, atten = protos.lm:forward({expanded_feats_conv, expanded_feats_fc, data.labels})

    -- Sampling variant, kept for reference:
    --local sampleOpt = {beam_size = opt.beam_size}
    --local seq, atten = protos.lm:sample({feat_conv_t, feats_fc, vocab}, sampleOpt)
    local sents, count = net_utils.decode_sequence(vocab, data.labels)

    -- store this batch's attention in its slice along dimension 2
    local first_col = (n - 1) * caps_per_batch + 1
    atten_out_all:narrow(2, first_col, caps_per_batch):copy(atten)

    for k = 1, #sents do
      -- captions are grouped per image: seq_per_img consecutive rows each
      local img_idx = math.floor((k - 1) / seq_per_img) + 1
      local img_id = data.img_id[img_idx]

      -- one value per step of caption k: 1 minus the first attention entry
      local prob_tmp = {}
      for m = 1, count[k] do
        prob_tmp[#prob_tmp + 1] = 1 - atten[m][k][1]
      end
      predictions[#predictions + 1] = {image_id = img_id, caption = sents[k], prob = prob_tmp}
    end
  end

  local lang_stats
  if opt.language_eval == 1 then
    lang_stats = net_utils.language_eval(predictions, {id = opt.id, dataset = opt.dataset})
  end

  return predictions, lang_stats, atten_out_all
end

-- Evaluate the 'test' split, then write the captions and the attention
-- tensor to the current directory.
local eval_opts = {
  val_images_use = opt.val_images_use,
  verbose = opt.verbose,
}
local split_predictions, lang_stats, atten_out_all = evaluate_split('test', eval_opts)

-- lang_stats is only set when the language evaluation ran.
if lang_stats then print(lang_stats) end

utils.write_json('visu_gt_test.json', split_predictions)
torch.save('atten_gt_test_1.t7', atten_out_all)