% Conference paper in the FLLM 2024 proceedings (IEEE).
% Names use the unambiguous "Last, First" form; {LLMs} and {FLLM} are braced so
% sentence-casing styles keep the acronyms upper-case.
% The conference dates live in `eventdate` and the publisher copyright in
% `copyright` (both ignored by classic styles) so they do not print as a note.
@inproceedings{3b4dcc29a40646cb8b28858ab38a429b,
  author    = {Osorio, Javier and Alsarra, Sultan and Converse, Amber and Alshammari, Afraa and Heintze, Dagmar and Khan, Latifur and Alatrush, Naif and Brandt, Patrick T. and D'Orazio, Vito and Zawad, Niamat and Billah, Mahrusa},
  title     = {Keep it Local: Comparing Domain-Specific {LLMs} in Native and Machine Translated Text using Parallel Corpora on Political Conflict},
  booktitle = {2024 2nd International Conference on Foundation and Large Language Models, {FLLM} 2024},
  editor    = {Jararweh, Yaser and Jansen, Jim and Alsmirat, Mohammad},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2024},
  pages     = {542--552},
  doi       = {10.1109/FLLM63129.2024.10852489},
  eventdate = {2024-11-26/2024-11-29},
  language  = {English (US)},
  copyright = {{\textcopyright} 2024 IEEE},
  keywords  = {machine translation, Multilingual LLMs, political conflict, United Nations},
  abstract  = {The dynamics of political conflict and cooperation require powerful computerized tools capable of effectively tracking security threats and cooperation around the world. This study compares the performance of domain-specific Large Language Models (LLMs) against generically-trained LLMs in binary and multi-class classification using native text in English, Spanish, and Arabic, and their corresponding machine translations. This endeavor yields four key contributions. 1) We present and make available a novel database of annotations using a multi-lingual parallel corpus from the United Nations. 2) Using various metrics, we assess the quality of different machine translation tools. 3) Our results indicate that the ConfliBERT family of LLMs, a set of domain-specific models tailored for political conflict, outperform generically-trained LLMs in English, Spanish, and Arabic in both binary and multi-class tasks. 4) We also disentangle the heterogeneous effects of machine translation on LLM performance in different languages. Overall, results reveal the comparative advantage of native-language domain-specific LLMs specialized on political conflict to understand the dynamics of violence and cooperation worldwide using native text. Our multi-lingual ConfliBERT LLMs provide critical cyber-infrastructure enabling scholars and government agencies use their local languages and information to foster safer, more stable political environments.},
}