; Annotated reading list / review template.
; Each entry below carries the bibliographic data for one paper plus a set of
; blank review fields (studentname, summary, contribution1-5, weakness1-5,
; interesting, opinions) that standard bibliography styles ignore.
; The "interesting" field is pre-filled with high/med/low; presumably the
; reviewer keeps exactly one of the three values - confirm with course staff.
;
; NOTE: text outside an entry is ignored by BibTeX, but a literal at-sign is
; always read as the start of an entry, so the address below is written with
; (at) instead of the at-sign.
;
; Hand this in to: ece849-staff+hw (at) ece.cmu.edu

; Required Readings

@inproceedings{meyer78_performability,
  author        = {Meyer, J. F.},
  title         = {On evaluating the performability of degradable computing systems},
  booktitle     = {{FTCS}},
  year          = {1978},
  url           = {http://ieeexplore.ieee.org/iel3/3846/11214/00532628.pdf},
  abstract      = {},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{bodson93_simplex,
  author        = {Bodson, M. and Lehoczky, J. and Rajkumar, R. and Sha, L. and Smith, M. and Soh, D. and Stephan, J.},
  title         = {Control reconfiguration in the presence of software failures},
  booktitle     = {Proceedings of the 32nd {IEEE} Conference on Decision and Control},
  year          = {1993},
  volume        = {3},
  pages         = {2284--2289},
  url           = {http://ieeexplore.ieee.org/iel2/1070/7738/00325604.pdf},
  abstract      = {In this paper, we discuss a special approach for software fault tolerance in control applications. A full-function, high performance, but complex control system is complemented by an error-free implementation of a highly reliable control system of lower functionality. When the correctness of the high performance controller is in doubt, the reliable control system takes over the execution of the task. An innovative feature of the approach is the disparity between the two control systems, which is used to exploit the relative advantages of the simple/reliable vs. complex/high-performance systems. Another innovative feature is the fault detection mechanism, which is based on measures of performance and of safety of the control system. The example of a ball and beam system is used to illustrate the concepts, and experimental results obtained on a laboratory set-up are presented.},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{shelton04_alternate_functionality,
  author        = {Shelton, C. and Koopman, P.},
  title         = {Improving System Dependability with Alternative Functionality},
  booktitle     = {International Conference on Dependable Systems and Networks},
  year          = {2004},
  pages         = {295--304},
  url           = {http://ieeexplore.ieee.org/iel5/9172/29105/01311899.pdf},
  abstract      = {We present the concept of alternative functionality for improving dependability in distributed embedded systems. Alternative functionality is a mechanism that complements traditional performability and graceful degradation techniques. Rather than providing reduced performance or functionality when components or subsystems fail, alternative functionality replaces a lost feature with another existing system function that can substitute for the lost service. This can provide improved system dependability when it is not feasible to allocate dedicated backup systems for fault tolerance. We show how alternative functionality can be applied to enhance system dependability with a case study of an elevator control system. In simulation, an elevator design that implemented alternative functionality in some of its subsystems tolerated many combinations of component failures that caused system failures in the original design.},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{strunk05_assured_reconfiguration,
  author        = {Strunk, Elisabeth A. and Knight, John C. and Aiello, M. Anthony},
  title         = {Assured Reconfiguration of Fail-Stop Systems},
  booktitle     = {International Conference on Dependable Systems and Networks},
  year          = {2005},
  month         = jun,
  address       = {Yokohama, Japan},
  url           = {http://www.cs.virginia.edu/~jck/publications/dsn.2005.assured.reconfiguration.pdf},
  abstract      = {Hardware dependability improvements have led to a situation in which it is sometimes unnecessary to employ extensive hardware replication to mask hardware faults. Expanding upon our previous work on assured reconfiguration for single processes and building upon the fail-stop model of processor behavior, we define a framework that provides assured reconfiguration for concurrent software. This framework can provide high dependability with lower space, power, and weight requirements than systems that replicate hardware to mask all anticipated faults. We base our assurance argument on a proof structure that extends the proofs for the single-application case and includes the fail-stop model of processor behavior. To assess the feasibility of instantiating our framework, we have implemented a hypothetical avionics system that is representative of what might be found on an unmanned aerial vehicle.},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; Supplemental Readings

@inproceedings{Adlemo95,
  author        = {Adlemo, A. and Andreasson, S.-A.},
  title         = {Improved availability in manufacturing systems through graceful degradation: case study of a machining cell},
  booktitle     = {Proceedings of 1995 {IEEE} International Conference on Robotics and Automation},
  year          = {1995},
  volume        = {2},
  pages         = {1744--1750},
  url           = {http://ieeexplore.ieee.org/iel3/3951/11430/00525524.pdf},
  abstract      = {This paper describes a method using production levels and gracefully degradable manufacturing systems. A production level is a way of grouping certain activities in a manufacturing system. The method, based on graceful degradation, makes use of the production levels to uphold the availability of the system during computer failures or data network failures. The method is demonstrated on a case study of a machining cell},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Herlihy91,
  author        = {Herlihy, M. P. and Wing, J. M.},
  title         = {Specifying graceful degradation},
  journal       = {IEEE Transactions on Parallel and Distributed Systems},
  year          = {1991},
  volume        = {2},
  number        = {1},
  pages         = {93--104},
  url           = {http://ieeexplore.ieee.org/iel4/71/2631/00080192.pdf},
  abstract      = {A description is given of the relaxation lattice method, a new approach to specifying graceful degradation for a large class of programs. A relaxation lattice is a lattice of specifications parameterized by a set of constraints, where the stronger the set of constraints, the more restrictive the specification. While a program is able to satisfy its strongest set of constraints, it satisfies its preferred specification, but if changes to the environment force it to satisfy a weaker set, then it will permit additional weakly consistent computations which are undesired but tolerated. The use of relaxation lattices is illustrated by specifications for programs that tolerate (1) faults, such as site crashes and network partitions, (2) timing anomalies, such as attempting to read a value too soon after it was written, (3) synchronization conflicts, such as choosing the oldest unlocked item from a queue, and (4) security breaches, such as acquiring unauthorized capabilities},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Knight00,
  author        = {Knight, J. C. and Sullivan, K. J.},
  title         = {Towards a definition of survivability},
  booktitle     = {ISW 2000. Information Survivability Workshop. Third Information Survivability Workshop - ISW-2000. `Research Directions and Research Collaborations to Protect the Global Information Society'},
  year          = {2000},
  pages         = {85--89},
  url           = {http://citeseer.nj.nec.com/knight00definition.html},
  abstract      = {Survivability is a new aspect of dependability that arises from the need to operate certain systems: after an event has occurred that damages a system such that the effects of the damage cannot be completely mitigated; and after a sequence of such damaging events has occurred. The paper presents a definition of survivability and relates it to the field of dependability and the technology of fault tolerance},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Losq77,
  author        = {Losq, J.},
  title         = {Effects of failures on gracefully degradable systems},
  booktitle     = {7th Annual International Conference on Fault-Tolerant Computing},
  year          = {1977},
  pages         = {29--34},
  url           = {http://www.ece.cmu.edu/~ece749/papers/losq77_gracefully_degradable_sys.pdf},
  abstract      = {The recent development of multiprocessor systems that offer resistance to faults by gracefully degrading after a failure opens vast new ranges of application for fault tolerance and high reliability. The paper presents a general model for the evaluation of such systems. It takes into account the internal structure of the hardware, the characteristics of the various detection mechanisms, the unreliability of the software and even the type of applications these systems are used for. It provides many measures of the systems' performance such as: availability, mean time between crashes, average processing power and proportion of time spent in degraded mode. System optimization gives the best values for the number of processors, memories... and shows the trade-offs between hardware and software fault-detection mechanisms. The model is illustrated by an example},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; NOTE(review): the key of the next entry used to be the two author names
; separated by a semicolon, which is not a legal citation key (it contained
; spaces). It is now Ng77, following the Losq77 / Adlemo95 pattern.

@inproceedings{Ng77,
  author        = {Ng, Ying-Wah and Avizienis, A.},
  title         = {A reliability model for gracefully degrading and repairable fault-tolerant systems},
  booktitle     = {7th Annual International Conference on Fault-Tolerant Computing},
  year          = {1977},
  pages         = {22--28},
  url           = {http://www.ece.cmu.edu/~ece749/papers/ng77_graceful_degradation.pdf},
  abstract      = {A reliability model for both permanent and transient faults in closed fault-tolerant computer systems has been described previously. The authors report an extension of this model in two directions: the inclusion of individual coverage parameters characterizing every step in the `graceful degradation' procedure which takes place after all spares have been exhausted; and the generalisation of the model to include repairable systems. Both of these extensions are included in the ARIES automated reliability estimation system, which consists of a set of interactive APL programs},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; NOTE(review): the url of the next entry was the bare placeholder http://
; with no host or path. It is left empty until the real link is supplied.

@article{Poledna95,
  author        = {Poledna, S.},
  title         = {Tolerating sensor timing faults in highly responsive hard real-time systems},
  journal       = {IEEE Transactions on Computers},
  year          = {1995},
  volume        = {44},
  number        = {2},
  pages         = {181--191},
  url           = {},
  abstract      = {Real-time systems that have to respond to environmental state changes within a very short latency period often use event-triggered task activation. If the system has to function correctly in the presence of sensor faults, event-triggered task activation is not reliable. Faulty sensors may cause task activations to occur too early, too late, or task activations are omitted entirely. In particular, early task activations can overload the system. Time-triggered task activation is reliable, but by defining a competitiveness ratio it is shown that the processor utilization for highly responsive tasks is unacceptably low. To overcome the problems of event-triggered task activation while preserving its good performance the task-splitting model is introduced. The task-splitting model integrates fault tolerance into the analysis and construction of hard real-time systems by using a combination of event-triggered and time-triggered task activation. Based on a general task model, it is independent of any particular scheduling algorithm. The result of this work has influenced the design of a new operating system which will be applied in a robust automotive engine controller of the next generation},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}