% Bethard, Ghosh, Martin, Sumner (2009): JCDL'09 conference paper on using
% text classification plus topic models to flag out-of-scope library resources.
@inproceedings{62947bb3400845c3be93ddda88ed242e,
  author    = {Bethard, Steven and Ghosh, Soumya and Martin, James H. and Sumner, Tamara},
  title     = {Topic model methods for automatically identifying out-of-scope resources},
  booktitle = {{JCDL}'09 - Proceedings of the 2009 {ACM/IEEE} Joint Conference on Digital Libraries},
  series    = {Proceedings of the {ACM/IEEE} Joint Conference on Digital Libraries},
  pages     = {19--28},
  year      = {2009},
  doi       = {10.1145/1555400.1555405},
  isbn      = {9781605586977},
  language  = {English (US)},
  keywords  = {Digital libraries, Machine learning, Relevance, Scope, Topics},
  abstract  = {Recent years have seen the rise of subject-themed digital libraries, such as the NSDL pathways and the Digital Library for Earth System Education (DLESE). These libraries often need to manually verify that contributed resources cover topics that fit within the theme of the library. We show that such scope judgments can be automated using a combination of text classification techniques and topic modeling. Our models address two significant challenges in making scope judgments: only a small number of out-of-scope resources are typically available, and the topic distinctions required for digital libraries are much more subtle than classic text classification problems. To meet these challenges, our models combine support vector machine learners optimized to different performance metrics and semantic topics induced by unsupervised statistical topic models. Our best model is able to distinguish resources that belong in DLESE from resources that don't with an accuracy of around 70\%. We see these models as the first steps towards increasing the scalability of digital libraries and dramatically reducing the workload required to maintain them.},
  note      = {2009 ACM/IEEE Joint Conference on Digital Libraries, JCDL'09 ; Conference date: 15-06-2009 Through 19-06-2009},
}