% MathAlign (LREC 2020) conference paper.
% - month is taken from the conference dates recorded in the annote text (11 to 16 May 2020).
% - Funding, conflict-of-interest and copyright text is kept in annote (not printed by
%   standard styles) rather than note, so it does not appear in the reference list.
% - The former series field was an exact copy of booktitle and has been dropped.
@inproceedings{99d6ac50e2f449859f92c3258b56fb39,
  author    = {Alexeeva, Maria and Sharp, Rebecca and Valenzuela-Esc{\'a}rcega, Marco A. and Kadowaki, Jennifer and Pyarelal, Adarsh and Morrison, Clayton},
  title     = {{MathAlign}: Linking formula identifiers to their contextual natural language descriptions},
  booktitle = {{LREC} 2020 - 12th International Conference on Language Resources and Evaluation, Conference Proceedings},
  editor    = {Calzolari, Nicoletta and Bechet, Frederic and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  publisher = {European Language Resources Association (ELRA)},
  year      = {2020},
  month     = may,
  pages     = {2204--2212},
  language  = {English (US)},
  keywords  = {Corpus creation, Machine reading, Math information retrieval, Relation extraction, Tool creation},
  abstract  = {Extending machine reading approaches to extract mathematical concepts and their descriptions is useful for a variety of tasks, ranging from mathematical information retrieval to increasing accessibility of scientific documents for the visually impaired. This entails segmenting mathematical formulae into identifiers and linking them to their natural language descriptions. We propose a rule-based approach for this task, which extracts {\LaTeX} representations of formula identifiers and links them to their in-text descriptions, given only the original PDF and the location of the formula of interest. We also present a novel evaluation dataset for this task, as well as the tool used to create it. The data and the source code are open source and are available at https://osf.io/bdxmr/ and https://github.com/ml4ai/automates, respectively.},
  annote    = {Funding Information: We thank the anonymous reviewers for their constructive feedback. This work is supported by the Defense Advanced Research Projects Agency (DARPA) as part of the Automated Scientific Knowledge Extraction (ASKE) program under agreement number HR00111990011. Marco Valenzuela-Esc{\'a}rcega declares a financial interest in LUM.AI. This interest has been properly disclosed to the University of Arizona Institutional Review Committee and is managed in accordance with its conflict of interest policies. Publisher Copyright: {\textcopyright} European Language Resources Association (ELRA), licensed under CC-BY-NC; 12th International Conference on Language Resources and Evaluation, LREC 2020 ; Conference date: 11-05-2020 Through 16-05-2020},
}