% Journal article record; the citation key is kept unchanged so existing citations still resolve.
% LaTeX specials in the free-text fields (percent sign, dollar sign, literal braces) are escaped
% so that styles which print the note or abstract compile cleanly.
% Author names use the unambiguous "von Last, First" form; the "de" particle follows the
% spelling used in the note text ("B. R. de Supinski").
% NOTE(review): no pages or article number is recorded here; the note text mentions "ART1" --
% confirm against the publisher record before adding one.
@article{07abfe7d28504c3da705e9e8cbe17945,
  author    = {Savoie, Lee and Lowenthal, David K. and de Supinski, Bronis R. and Mohror, Kathryn and Jain, Nikhil},
  title     = {Mitigating Inter-Job Interference via Process-Level Quality-of-Service},
  journal   = {ACM Transactions on Parallel Computing},
  volume    = {8},
  number    = {1},
  year      = {2021},
  month     = apr,
  publisher = {Association for Computing Machinery (ACM)},
  issn      = {2329-4949},
  doi       = {10.1145/3434397},
  language  = {English (US)},
  keywords  = {High-performance computing, network contention, quality of service},
  abstract  = {Jobs on most high-performance computing (HPC) systems share the network with other concurrently executing jobs. Network sharing leads to contention that can severely degrade performance. This article investigates the use of Quality of Service (QoS) mechanisms to reduce the negative impacts of network contention. QoS allows users to manage resource sharing between network flows and to provide bandwidth guarantees to specific flows. Our results show that careful use of QoS reduces the impact of network contention for specific jobs, resulting in up to a 40\% performance improvement. In some cases, it completely eliminates the impact of contention. It achieves these improvements with limited negative impact to other jobs; any job that experiences performance loss typically degrades less than 5\%, and often much less. Our approach can help ensure that HPC machines maintain high levels of throughput as per-node compute power continues to increase faster than network bandwidth.},
  note      = {Funding Information: A short paper with the same title was published in IEEE International Conference on Cluster Computing, 2019 [Savoie et al. 2019]. This work was performed under the auspices of the U.S. Department of Energy by Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344 (LLNL-JRNL-813440). This material is based upon work supported by the National Science Foundation under Grant No. 1526015. Authors{\textquoteright} addresses: L. Savoie, Thermopylae Sciences + Technology, 1911 N. Fort Myer Dr. Suite 700, Arlington, VA 22209; email: lsavoie@cs.arizona.edu; D. K. Lowenthal, Department of Computer Science, 1040 E. 4th Street, P.O. Box 210077, Tucson AZ 85721; email: dkl@cs.arizona.edu; B. R. de Supinski and K. Mohror, Lawrence Livermore National Laboratory, P.O. Box 808, L-557, Livermore, CA 94551-0808; emails: \{bronis, mohror1\}@llnl.gov; N. Jain, Nvidia Corporation, Santa Clara, California, 95050; email: nikhil.jain@acm.org. Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for components of this work owned by others than ACM must be honored. Abstracting with credit is permitted. To copy otherwise, or republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. Request permissions from permissions@acm.org. {\textcopyright} 2020 Association for Computing Machinery. 2329-4949/2020/01-ART1 \$15.00 https://doi.org/10.1145/3434397 Publisher Copyright: {\textcopyright} 2021 ACM.},
}