% Workshop paper (SDP 2024, held at ACL 2024). Cleaned portal export: names are in
% "von Last, First" form, and the proceedings title lives in booktitle only.
@inproceedings{c545a143be1840548458158dfa51ee32,
  author    = {Song, Hyunju and Bethard, Steven and Thomer, Andrea K.},
  title     = {Metadata Enhancement Using Large Language Models},
  booktitle = {{SDP} 2024 -- 4th Workshop on Scholarly Document Processing, Proceedings of the Workshop},
  editor    = {Ghosal, Tirthankar and Singh, Amanpreet and de Waard, Anita and Mayr, Philipp and Naik, Aakanksha and Weller, Orion and Lee, Yoonjoo and Shen, Shannon and Qin, Yanxia},
  pages     = {145--154},
  publisher = {Association for Computational Linguistics (ACL)},
  year      = {2024},
  note      = {Workshop held at {ACL} 2024, 16 August 2024},
  language  = {english},
  copyright = {{\textcopyright} 2024 Association for Computational Linguistics},
  abstract  = {In the natural sciences, a common form of scholarly document is a physical sample record, which provides categorical and textual metadata for specimens collected and analyzed for scientific research. Physical sample archives like museums and repositories publish these records in data repositories to support reproducible science and enable the discovery of physical samples. However, the success of resource discovery in such interfaces depends on the completeness of the sample records. We investigate approaches for automatically completing the scientific metadata fields of sample records. We apply large language models in zero and few-shot settings and incorporate the hierarchical structure of the taxonomy. We show that a combination of record summarization, bottom-up taxonomy traversal, and few-shot prompting yield an F1 score as high as 0.928 on metadata completion in the Earth science domain.},
}