@inproceedings{5e41e53362674bdd865f102c7eca1554,
  author    = {Liaw, Andrew and Hsu, Jia Hao and Wu, Chung Hsien},
  title     = {Ensemble of One Model: Creating Model Variations for {Transformer} with Layer Permutation},
  booktitle = {2021 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference, APSIPA ASC 2021 - Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {1026--1030},
  year      = {2021},
  abstract  = {Ensemble involves combining the outputs of multiple models to increase performance. This technique has enjoyed great success across many fields in machine learning. This study focuses on a novel approach to increase performance of a model without any increase in number of parameters. The proposed approach involves training a model that can have different variations that perform well and different enough for ensemble. The variations are created by changing the order of the layers of a machine learning model. Moreover, this method can be combined with existing ensemble technique to further improve the performance. The task chosen for evaluating the performance is machine translation with Transformer, as Transformer is the current state-of-the-art model for this task as well as many natural language processing tasks. The IWSLT 2014 German to English and French to English datasets see an increase of at least 0.7 BLEU score over single model baseline with the same model size. When combined with multiple model ensemble, minimum increase of 0.3 BLEU is observed with no increase in parameters.},
  note      = {Publisher Copyright: {\textcopyright} 2021 APSIPA.; 2021 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference, APSIPA ASC 2021 ; Conference date: 14-12-2021 Through 17-12-2021},
  address   = {United States},
  language  = {English},
}