% Conference paper (IEEE CLUSTER 2019). The citation key is the exporter's
% opaque identifier and is kept unchanged so existing \cite commands still work.
% The `funding` and `internal-note` fields are non-standard: bibliography styles
% ignore them, so that text is retained in the database without being printed.
@inproceedings{d7762edce4c64d78854116dc816a6dca,
  author        = {Savoie, Lee and Lowenthal, David K. and {De Supinski}, Bronis R. and Mohror, Kathryn and Jain, Nikhil},
  title         = {Mitigating Inter-Job Interference via Process-Level Quality-of-Service},
  booktitle     = {Proceedings - 2019 {IEEE} International Conference on Cluster Computing, {CLUSTER} 2019},
  series        = {Proceedings - {IEEE} International Conference on Cluster Computing, {ICCC}},
  publisher     = {Institute of Electrical and Electronics Engineers Inc.},
  year          = 2019,
  month         = sep,
  doi           = {10.1109/CLUSTER.2019.8891007},
  keywords      = {{MPI}, network contention, quality of service},
  language      = {English (US)},
  abstract      = {Jobs on most high-performance computing (HPC) systems share the network with other concurrently executing jobs. This sharing creates contention that can severely degrade performance. We investigate the use of Quality of Service (QoS) mechanisms to reduce the negative impacts of network contention. Our results show that careful use of QoS reduces the impact of contention for specific jobs, resulting in up to a 27\% performance improvement. In some cases the impact of contention is completely eliminated. These improvements are achieved with limited negative impact to other jobs; any job that experiences performance loss typically degrades less than 5\%, often much less. Our approach can help ensure that HPC machines maintain high throughput as per-node compute power continues to increase faster than network bandwidth.},
  funding       = {This work was performed under the auspices of the U.S. Department of Energy by Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344 (LLNL-CONF-787578). In addition, this material is based upon work supported by the National Science Foundation under Grant No. 1526015.},
  internal-note = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 2019 IEEE International Conference on Cluster Computing, CLUSTER 2019 ; Conference date: 23-09-2019 Through 26-09-2019},
}