% Workshop paper: Maharjan, Blair, Bethard and Solorio, "Developing
% language-tagged corpora for code-switching tweets", in the proceedings of
% the 9th Linguistic Annotation Workshop (LAW 2015).
%
% Maintenance notes for this record:
%   - year is 2015, matching the copyright year and conference date given in
%     the note field (the portal export carried 2020, which contradicted both).
%   - No series field: the exported value was a verbatim copy of booktitle,
%     which makes most styles print the proceedings title twice.
%   - Names are stored as "Last, First" so name parsing is unambiguous.
%   - Acronyms in booktitle are braced so styles cannot downcase them.
%   - NOTE(review): the conference date "05-06-2015" is kept exactly as
%     exported; presumably day-month-year, confirm before adding a month field.
@inproceedings{f472ab4c83fb42799530bd0b9b653495,
  author    = {Maharjan, Suraj and Blair, Elizabeth and Bethard, Steven and Solorio, Thamar},
  title     = {Developing language-tagged corpora for code-switching tweets},
  editor    = {Meyers, Adam and Rehbein, Ines and Zinsmeister, Heike},
  booktitle = {{LAW} 2015 - 9th Linguistic Annotation Workshop, held in conjunction with {NAACL} 2015 - Proceedings of the Workshop},
  publisher = {Association for Computational Linguistics (ACL)},
  year      = {2015},
  pages     = {72--84},
  language  = {English (US)},
  note      = {Publisher Copyright: {\textcopyright} 2015 Association for Computational Linguistics; 9th Linguistic Annotation Workshop, LAW 2015, held in conjunction with NAACL 2015 ; Conference date: 05-06-2015},
  abstract  = {Code-switching, where a speaker switches between languages mid-utterance, is frequently used by multilingual populations worldwide. Despite its prevalence, limited effort has been devoted to develop computational approaches or even basic linguistic resources to support research into the processing of such mixed-language data. We present a user-centric approach to collecting code-switched utterances from social media posts, and develop language universal guidelines for the annotation of code-switched data. We also present results for several baseline language identification models on our corpora and demonstrate that language identification in code-switched text is a difficult task that calls for deeper investigation.},
}