bibfile.bib

% arXiv preprint 1801.06146 (cs.CL).
@misc{howard2018universal,
  author        = {Howard, Jeremy and others},
  title         = {Universal Language Model Fine-tuning for Text Classification},
  year          = {2018},
  eprint        = {1801.06146},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint 1706.03762 (cs.CL).
@misc{vaswani2017attention,
  author        = {Vaswani, Ashish and others},
  title         = {Attention Is All You Need},
  year          = {2017},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint 1905.05583 (cs.CL).
% The acronym BERT is braced so sentence-casing styles do not lowercase it.
@misc{sun2019finetune,
  author        = {Sun, Chi and others},
  title         = {How to Fine-Tune {BERT} for Text Classification?},
  year          = {2019},
  eprint        = {1905.05583},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Entry type changed from inproceedings (which requires a booktitle that was
% never supplied) to techreport: this work was released as an OpenAI
% technical report, not in conference proceedings.
% The acronym GPU is braced so sentence-casing styles keep it uppercase.
@techreport{Gray2017GPUKF,
  author      = {Gray, Scott and others},
  title       = {{GPU} Kernels for Block-Sparse Weights},
  institution = {OpenAI},
  year        = {2017},
}
% arXiv preprint 1906.08237 (cs.CL).
% The model name XLNet is braced so sentence-casing styles preserve its capitals.
@misc{yang2019xlnet,
  author        = {Yang, Zhilin and others},
  title         = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
  year          = {2019},
  eprint        = {1906.08237},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% arXiv preprint 1810.04805 (cs.CL).
% The acronym BERT is braced so sentence-casing styles do not lowercase it.
@misc{devlin2018bert,
  author        = {Devlin, Jacob and others},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  year          = {2018},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Entry type changed from article (which requires a journal that was never
% supplied) to techreport: this work was released as an OpenAI technical
% report rather than in a journal.
@techreport{radford2019language,
  author      = {Radford, Alec and others},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019},
}

% arXiv preprint 1909.08053 (cs.CL).
% The system name Megatron-LM is braced so sentence-casing styles keep "LM" uppercase.
@misc{shoeybi2019megatronlm,
  author        = {Shoeybi, Mohammad and others},
  title         = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  year          = {2019},
  eprint        = {1909.08053},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Journal name and title use their published capitalisation; styles may
% downcase a title but cannot restore capitals that were never entered.
% The BibSonomy export fields (added-at, biburl, interhash, intrahash,
% keywords, timestamp) are kept as-is; standard styles ignore them.
@article{hochreiter1997long,
  author    = {Hochreiter, Sepp and others},
  title     = {Long Short-Term Memory},
  journal   = {Neural Computation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  year      = {1997},
  publisher = {MIT Press},
  keywords  = {lstm rnn},
  biburl    = {https://www.bibsonomy.org/bibtex/2a4a80026d24955b267cae636aa8abe4a/dallmann},
  interhash = {0692b471c4b9ae65d00affebc09fb467},
  intrahash = {a4a80026d24955b267cae636aa8abe4a},
  added-at  = {2016-11-15T08:49:43.000+0100},
  timestamp = {2016-11-15T08:49:43.000+0100},
}

% arXiv preprint 1804.07461 (cs.CL).
% The acronym GLUE is braced so sentence-casing styles do not lowercase it.
@misc{wang2018glue,
  author        = {Wang, Alex and others},
  title         = {{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
  year          = {2018},
  eprint        = {1804.07461},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% arXiv preprint 1802.05365 (cs.CL).
@misc{peters2018deep,
  author        = {Peters, Matthew E. and others},
  title         = {Deep contextualized word representations},
  year          = {2018},
  eprint        = {1802.05365},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% The language names in the title are proper nouns; they are braced so
% sentence-casing styles do not render them as "japanese" and "korean".
@inproceedings{wordPiece,
  author    = {Schuster, Mike and others},
  title     = {{Japanese} and {Korean} Voice Search},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
  pages     = {5149--5152},
  year      = {2012},
}

% Chapter in an edited volume; editors are listed in "Last, First" form.
@incollection{mikolov2013,
  author    = {Mikolov, Tomas and others},
  title     = {Distributed Representations of Words and Phrases and their Compositionality},
  booktitle = {Advances in Neural Information Processing Systems 26},
  editor    = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.},
  publisher = {Curran Associates, Inc.},
  pages     = {3111--3119},
  year      = {2013},
  url       = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf},
}

% The method name is spelled GloVe (capital V); the whole word is braced so
% its casing survives sentence-casing styles, instead of bracing one letter.
@inproceedings{penningtonglove,
  author    = {Pennington, Jeffrey and others},
  title     = {{GloVe}: Global Vectors for Word Representation},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  publisher = {Association for Computational Linguistics},
  address   = {Doha, Qatar},
  month     = oct,
  year      = {2014},
  pages     = {1532--1543},
  doi       = {10.3115/v1/D14-1162},
  url       = {https://www.aclweb.org/anthology/D14-1162},
}

% Entry type changed from inproceedings (which requires a booktitle that was
% never supplied) to techreport: this work was released as an OpenAI
% technical report, not in conference proceedings.
@techreport{Radford2018ImprovingLU,
  author      = {Radford, Alec and others},
  title       = {Improving Language Understanding by Generative Pre-Training},
  institution = {OpenAI},
  year        = {2018},
}

% Conference paper; the proceedings title is given in booktitle.
@inproceedings{mikolov2018advances,
  author    = {Mikolov, Tomas and others},
  title     = {Advances in Pre-Training Distributed Word Representations},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
  year      = {2018},
}