% Raghavendra, Prasanna Kumar and Hariri (1988), "Reliability Analysis in
% Distributed Systems", IEEE Transactions on Computers 37(3).
% - note:   only the funding acknowledgement; this field is printed by standard
%           styles, so it must stay short and LaTeX-safe (ampersand escaped).
% - annote: text the exporter had scraped from the article's first page and filed
%           under note; kept for reference, not printed by standard styles.
%           Bracketed citation numbers were repaired from OCR residue
%           (e.g. "151" read as "[5]") -- verify against the paper if they matter.
@article{0032441e37bb4db1821563ae26f30205,
author = "Raghavendra, {C. S.} and {Prasanna Kumar}, {V. K.} and Hariri, S.",
title = "Reliability Analysis in Distributed Systems",
journal = "IEEE Transactions on Computers",
volume = "37",
number = "3",
pages = "352--358",
year = "1988",
month = mar,
doi = "10.1109/12.2173",
issn = "0018-9340",
publisher = "IEEE Computer Society",
language = "English (US)",
keywords = "Allocation of files, distributed, distributed program, file spanning tree, reliability analysis, system",
abstract = "Reliability of a distributed processing system is an important design parameter that can be described in terms of the reliability of processing elements and communication links and also of the redundancy of programs and data files. The traditional terminal-pair reliability does not capture the redundancy of programs and files in a distributed system. Two reliability measures are introduced which are distributed program reliability that describes the probability of successful execution of a program requiring cooperation of several computers, and distributed system reliability which is the probability that all the specified distributed programs for the system are operational. These two reliability measures can be extended to incorporate the effects of user sites on reliability. We develop an efficient unified approach based on graph traversal to evaluate the proposed reliability measures.",
note = "Funding Information: This work was supported in part by an NSF Presidential Young Investigator Award DCI-8452003 and a Grant from AT\&T Information Systems. S. Hariri is supported by an Agency for International Development (AID) Fellowship.",
annote = "Distributed processing involves cooperation among several loosely coupled computers communicating over a subnetwork. Distributed systems provide cost-effective means for resource sharing and extensibility, and obtain potential increases in performance, reliability, and fault tolerance [5], [6], [8], [9], [23], [24]. Performance improvement is possible due to multiple computers cooperatively executing common tasks, and increased reliability is achievable because of the redundancy of resources. Several issues of such systems, namely, process management, load balancing, file management, access control, distributed algorithms, etc., are under wide-spread investigation [5], [7], [9], [14]-[17], [19], [20]. A distributed program usually requires one or more of the resources such as PE{\textquoteright}s, data files, etc., for successful execution. For successful completion of a program, the local host, the processing elements having the required files, and the interconnecting links must all be operational. With processing elements and communication links having a certain probability of being operational, there is a certain probability associated with the event that a program can be successfully executed. Also the distribution of data file can affect the overall reliability of the system. Thus, an important problem in distributed system design and analysis is to define and efficiently evaluate various reliability measures as well as estimate the effect of [...] Manuscript received October 7, 1985; revised December 11, 1986. The authors are with the Department of Electrical Engineering-Systems, University of Southern California, Los Angeles, CA 90089. IEEE Log Number 8715443.",
}