% Conference paper (ACM SAC 2021): Stargate, a file system for on-demand
% remote data access between Hadoop clusters. Citation key kept as exported
% so existing \cite commands continue to resolve.
@inproceedings{51b80f03fe7f43859da3bb32ff6e0674,
  author    = {Choi, Illyoung and Hartman, John H.},
  title     = {{Stargate}: Remote Data Access Between {Hadoop} Clusters},
  booktitle = {Proceedings of the 36th Annual {ACM} Symposium on Applied Computing, {SAC} 2021},
  series    = {Proceedings of the {ACM} Symposium on Applied Computing},
  publisher = {Association for Computing Machinery},
  pages     = {32--39},
  year      = {2021},
  month     = mar,
  day       = {22},
  doi       = {10.1145/3412841.3441635},
  language  = {English (US)},
  keywords  = {WAN, WAN file system, cluster-to-cluster data transfer, file system, on-demand remote data access, remote data access, wide-area network},
  abstract  = {The transfer of large-scale datasets between geographically separated systems is a challenge in scientific computing, made even more complicated when the systems are clusters of computers. In this paper we present Stargate, a file system that enables efficient on-demand remote data access for Hadoop-based scientific computations. Stargate uses a content-addressable protocol, on-demand access, and multi-tier caching to address the challenges of large data transfers over a WAN. Stargate also uses a novel approach that co-locates computations and transfers to achieve efficient data access in cluster computing. Unlike other approaches, Stargate is implemented as an independent file system service that works with any computation framework. In our experiments Stargate's performance on heavy I/O workloads was 7\% faster than WebHDFS and only 8\% slower than HDFS. In addition, Stargate's caches effectively trade high-cost WAN traffic for low-cost LAN traffic. Stargate's performance, on-demand data access, and reduction in WAN traffic make it a good platform for providing remote dataset access to scientific computations on clusters.},
  note      = {Publisher Copyright: {\textcopyright} 2021 ACM.; 36th Annual ACM Symposium on Applied Computing, SAC 2021 ; Conference date: 22-03-2021 Through 26-03-2021},
}