# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT pretraining evaluation / inference runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# ==================
import os
import csv
import logging
import argparse
import random
import math
import time

import h5py
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from tqdm import tqdm, trange
from apex.parallel import DistributedDataParallel as DDP

from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
# from fused_adam_local import FusedAdamBert
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class pretraining_dataset(Dataset):
    """Loads one pre-processed hdf5 file of pretraining instances."""

    def __init__(self, input_file, max_pred_length):
        self.input_file = input_file
        self.max_pred_length = max_pred_length
        f = h5py.File(input_file, "r")
        self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)                      # [num_instances x max_seq_length]
        self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64)                   # [num_instances x max_seq_length]
        self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64)                  # [num_instances x max_seq_length]
        self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        self.masked_lm_ids = np.asarray(f["masked_lm_ids"][:]).astype(np.int64)              # [num_instances x max_pred_length]
        self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64)  # [num_instances]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.input_ids)

    def __getitem__(self, index):
        input_ids = torch.from_numpy(self.input_ids[index])                      # [max_seq_length]
        input_mask = torch.from_numpy(self.input_masks[index])                   # [max_seq_length]
        segment_ids = torch.from_numpy(self.segment_ids[index])                  # [max_seq_length]
        masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index])  # [max_pred_length]
        masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index])              # [max_pred_length]
        next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index]))  # [1]

        # Scatter the masked-LM targets into a full-length label vector;
        # -1 marks positions that are ignored by the loss.
        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
        index = self.max_pred_length  # store number of real masked tokens in index
        # masked_lm_positions is zero-padded; the first zero marks the end of the real predictions.
        if len((masked_lm_positions == 0).nonzero()) != 0:
            index = (masked_lm_positions == 0).nonzero()[0].item()
        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]

        return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
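
# ---------------------------------------------------------------------------
# Illustration only (hypothetical values, not executed): how __getitem__ above
# turns the zero-padded (position, id) pairs into a full-length label vector.
# With max_pred_length = 5 and a sample whose real predictions are three
# masked tokens,
#   masked_lm_positions = [ 4,  9, 17, 0, 0]
#   masked_lm_ids       = [12,  7, 30, 0, 0]
# the first zero in masked_lm_positions gives index = 3, so
#   masked_lm_labels[4]  = 12
#   masked_lm_labels[9]  = 7
#   masked_lm_labels[17] = 30
# and every other position stays at -1, which the masked-LM loss treats as
# "ignore".
# ---------------------------------------------------------------------------
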
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain .hdf5 files for the task.")
    parser.add_argument("--config_file",
                        default="bert_config.json",
                        type=str,
                        required=False,
                        help="The BERT model config")
    parser.add_argument("--ckpt_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The ckpt directory, e.g. /results")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--eval', dest='do_eval', action='store_true')
    group.add_argument('--prediction', dest='do_eval', action='store_false')

    ## Other parameters
    parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--max_seq_length",
                        default=512,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--max_predictions_per_seq",
                        default=80,
                        type=int,
                        help="The maximum total number of masked tokens per input sequence")
    parser.add_argument("--ckpt_step",
                        default=-1,
                        type=int,
                        required=False,
                        help="The model checkpoint iteration, e.g. 1000")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="Total number of eval steps to perform; otherwise use the full dataset")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed inference on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        assert args.local_rank != -1  # only use torch.distributed for multi-gpu
    logger.info("device %s n_gpu %d distributed inference %r", device, n_gpu, bool(args.local_rank != -1))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForPreTraining(config)

    if args.ckpt_step == -1:
        # Retrieve the latest checkpoint in ckpt_dir
        model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
        args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
        print("load model saved at iteration", args.ckpt_step)
    model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".model")
    state_dict = torch.load(model_file, map_location="cpu")
    model.load_state_dict(state_dict, strict=False)

    if args.fp16:
        model.half()  # all parameters and buffers are converted to half precision
    model.to(device)

    multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized()
    if multi_gpu_training:
        model = DDP(model)
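
    # Note on the distributed path, as read from the code below (a description,
    # not an added guarantee): when --local_rank is set, each rank wraps its
    # model in apex DistributedDataParallel and later reads its own shard of
    # every hdf5 file through DistributedSampler. In --eval mode the per-rank
    # mean loss is divided by the world size and summed across ranks with
    # dist.all_reduce, which yields the cross-rank mean that rank 0 logs.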
    files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
             if os.path.isfile(os.path.join(args.input_dir, f))]
    files.sort()

    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    print("Evaluation...")

    nb_instances = 0
    max_steps = args.max_steps if args.max_steps > 0 else np.inf
    global_step = 0

    with torch.no_grad():
        if args.do_eval:
            final_loss = 0.0
            for data_file in files:
                logger.info("file %s", data_file)
                dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    eval_sampler = RandomSampler(dataset)
                else:
                    eval_sampler = DistributedSampler(dataset)
                datasetloader = DataLoader(dataset, sampler=eval_sampler,
                                           batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)

                for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    # With labels supplied, the model returns the combined MLM + NSP loss.
                    loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels)
                    final_loss += loss

                    global_step += 1

                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break

            final_loss /= global_step
            if multi_gpu_training:
                final_loss /= torch.distributed.get_world_size()
                dist.all_reduce(final_loss)
            if not multi_gpu_training or torch.distributed.get_rank() == 0:
                logger.info("Finished: Final Loss = {}".format(final_loss))

        else:  # inference
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            # start_t0 = time.time()
            for data_file in files:
                logger.info("file %s", data_file)
                dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    eval_sampler = RandomSampler(dataset)
                else:
                    eval_sampler = DistributedSampler(dataset)
                datasetloader = DataLoader(dataset, sampler=eval_sampler,
                                           batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)

                for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    # Without labels, the model returns the raw MLM and NSP logits.
                    lm_logits, nsp_logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                                  attention_mask=input_mask, masked_lm_labels=None,
                                                  next_sentence_label=None)

                    nb_instances += input_ids.size(0)
                    global_step += 1

                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break

            # if multi_gpu_training:
            #     torch.distributed.barrier()
            if not multi_gpu_training or torch.distributed.get_rank() == 0:
                logger.info("Finished")


if __name__ == "__main__":
    main()
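
# ---------------------------------------------------------------------------
# Example invocations (illustrative only: the script name and all paths are
# placeholders; only the flags defined above are real).
#
#   python run_pretraining_inference.py \
#       --input_dir /data/hdf5_shards \
#       --ckpt_dir /results \
#       --config_file bert_config.json \
#       --eval_batch_size 8 \
#       --fp16 \
#       --eval
#
# For multi-GPU runs the script expects a launcher that sets --local_rank,
# e.g. torch.distributed.launch:
#
#   python -m torch.distributed.launch --nproc_per_node=8 \
#       run_pretraining_inference.py \
#       --input_dir /data/hdf5_shards --ckpt_dir /results --eval
# ---------------------------------------------------------------------------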