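; Hand this in to:   ece849-staff+hw (at) ece.cmu.edu
; (The address is written with "(at)" because BibTeX treats a literal at-sign
;  outside an entry as the start of a new entry and reports a parse error.)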


; Required Readings

; Meyer (1978): performability of degradable computing systems.
; The venue is stored in booktitle, which BibTeX requires for this entry type.
; Fields from studentname onward are blank review placeholders (presumably
; filled in by the student before hand-in).
@inproceedings{meyer78_performability,
  author        = "Meyer, J. F.",
  title         = "On evaluating the performability of degradable
                   computing systems",
  booktitle     = "FTCS",
  year          = "1978",
  abstract      = "",
  url           = "http://ieeexplore.ieee.org/iel3/3846/11214/00532628.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Bodson et al. (1993): control reconfiguration under software failures.
; The proceedings title lives in booktitle (an "inbook" field is unknown to
; BibTeX and would be silently dropped). Page ranges use a double hyphen.
@inproceedings{bodson93_simplex,
  author        = "Bodson, M. and Lehoczky, J. and Rajkumar, R. and
                   Sha, L. and Smith, M. and Soh, D. and Stephan, J.",
  title         = "Control reconfiguration in the presence of software
                   failures",
  booktitle     = "Proceedings of the 32nd IEEE Conference on Decision
                   and Control",
  year          = "1993",
  volume        = "3",
  pages         = "2284--2289",
  abstract      = "In this paper, we discuss a special approach for
                   software fault tolerance in control applications. A
                   full-function, high performance, but complex control
                   system is complemented by an error-free
                   implementation of a highly reliable control system
                   of lower functionality. When the correctness of the
                   high performance controller is in doubt, the
                   reliable control system takes over the execution of
                   the task. An innovative feature of the approach is
                   the disparity between the two control systems, which
                   is used to exploit the relative advantages of the
                   simple/reliable vs. complex/high-performance
                   systems. Another innovative feature is the fault
                   detection mechanism, which is based on measures of
                   performance and of safety of the control system. The
                   example of a ball and beam system is used to
                   illustrate the concepts, and experimental results
                   obtained on a laboratory set-up are presented.",
  url           = "http://ieeexplore.ieee.org/iel2/1070/7738/00325604.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Shelton and Koopman (2004): alternative functionality for dependability.
; The conference name is stored in booktitle, the field BibTeX styles read
; for this entry type.
@inproceedings{shelton04_alternate_functionality,
  author        = "Shelton, C. and Koopman, P.",
  title         = "Improving System Dependability with Alternative
                   Functionality",
  booktitle     = "International Conference on Dependable Systems and
                   Networks",
  year          = "2004",
  pages         = "295--304",
  url           = "http://ieeexplore.ieee.org/iel5/9172/29105/01311899.pdf",
  abstract      = "We present the concept of alternative functionality
                   for improving dependability in distributed embedded
                   systems. Alternative functionality is a mechanism
                   that complements traditional performability and
                   graceful degradation techniques. Rather than
                   providing reduced performance or functionality when
                   components or subsystems fail, alternative
                   functionality replaces a lost feature with another
                   existing system function that can substitute for the
                   lost service. This can provide improved system
                   dependability when it is not feasible to allocate
                   dedicated backup systems for fault tolerance. We
                   show how alternative functionality can be applied to
                   enhance system dependability with a case study of an
                   elevator control system. In simulation, an elevator
                   design that implemented alternative functionality in
                   some of its subsystems tolerated many combinations
                   of component failures that caused system failures in
                   the original design.",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Strunk, Knight and Aiello (2005): assured reconfiguration of fail-stop systems.
; month uses the predefined three-letter macro (unquoted) so each style can
; render it; all author names use the unambiguous "Last, First" form.
@inproceedings{strunk05_assured_reconfiguration,
  author        = {Strunk, Elisabeth A. and Knight, John C. and
                   Aiello, M. Anthony},
  title         = {Assured Reconfiguration of Fail-Stop Systems},
  booktitle     = {International Conference on Dependable Systems and
                   Networks},
  year          = {2005},
  address       = {Yokohama, Japan},
  month         = jun,
  url           = "http://www.cs.virginia.edu/~jck/publications/dsn.2005.assured.reconfiguration.pdf",
  abstract      = "Hardware dependability improvements have led to a
                   situation in which it is sometimes unnecessary to
                   employ extensive hardware replication to mask
                   hardware faults. Expanding upon our previous work on
                   assured reconfiguration for single processes and
                   building upon the fail-stop model of processor
                   behavior, we define a framework that provides
                   assured reconfiguration for concurrent
                   software. This framework can provide high
                   dependability with lower space, power, and weight
                   requirements than systems that replicate hardware to
                   mask all anticipated faults. We base our assurance
                   argument on a proof structure that extends the
                   proofs for the single-application case and includes
                   the fail-stop model of processor behavior. To assess
                   the feasibility of instantiating our framework, we
                   have implemented a hypothetical avionics system that
                   is representative of what might be found on an
                   unmanned aerial vehicle.",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}




; Supplemental Readings

; Adlemo and Andreasson (1995): graceful degradation in a machining cell.
; Authors are joined with " and " (BibTeX's only name separator) and the
; proceedings title is stored in booktitle.
@inproceedings{Adlemo95,
   author        = "Adlemo, A. and Andreasson, S.-A.",
   title         = "Improved availability in manufacturing systems through graceful degradation: case study of a machining cell",
   booktitle     = "Proceedings of 1995 IEEE International Conference on Robotics and Automation",
   year          = "1995",
   pages         = "1744--1750",
   volume        = "2",
   abstract      = "This paper describes a method using production levels and gracefully degradable manufacturing systems. A production level is a way of grouping certain activities in a manufacturing system. The method, based on graceful degradation, makes use of the production levels to uphold the availability of the system during computer failures or data network failures. The method is demonstrated on a case study of a machining cell",
   url           = "http://ieeexplore.ieee.org/iel3/3951/11430/00525524.pdf",
   studentname   = "",
   summary       = "",
   contribution1 = "",
   contribution2 = "",
   contribution3 = "",
   contribution4 = "",
   contribution5 = "",
   weakness1     = "",
   weakness2     = "",
   weakness3     = "",
   weakness4     = "",
   weakness5     = "",
   interesting   = "high/med/low",
   opinions      = "",
}

; Herlihy and Wing (1991): the relaxation lattice method for specifying
; graceful degradation. The volume is kept in its own field, not inside the
; journal name, so styles can format it.
@article{Herlihy91,
   author        = "Herlihy, M. P. and Wing, J. M.",
   title         = "Specifying graceful degradation",
   journal       = "IEEE Transactions on Parallel and Distributed Systems",
   year          = "1991",
   volume        = "2",
   number        = "1",
   pages         = "93--104",
   abstract      = "A description is given of the relaxation lattice method, a new approach to specifying graceful degradation for a large class of programs. A relaxation lattice is a lattice of specifications parameterized by a set of constraints, where the stronger the set of constraints, the more restrictive the specification. While a program is able to satisfy its strongest set of constraints, it satisfies its preferred specification, but if changes to the environment force it to satisfy a weaker set, then it will permit additional weakly consistent computations which are undesired but tolerated. The use of relaxation lattices is illustrated by specifications for programs that tolerate (1) faults, such as site crashes and network partitions, (2) timing anomalies, such as attempting to read a value too soon after it was written, (3) synchronization conflicts, such as choosing the oldest unlocked item from a queue, and (4) security breaches, such as acquiring unauthorized capabilities",
   url           = "http://ieeexplore.ieee.org/iel4/71/2631/00080192.pdf",
   studentname   = "",
   summary       = "",
   contribution1 = "",
   contribution2 = "",
   contribution3 = "",
   contribution4 = "",
   contribution5 = "",
   weakness1     = "",
   weakness2     = "",
   weakness3     = "",
   weakness4     = "",
   weakness5     = "",
   interesting   = "high/med/low",
   opinions      = "",
}

; Knight and Sullivan (2000): a definition of survivability.
; Authors are joined with " and "; the workshop title is stored in booktitle.
@inproceedings{Knight00,
   author        = "Knight, J. C. and Sullivan, K. J.",
   title         = "Towards a definition of survivability",
   booktitle     = "ISW 2000. Information Survivability Workshop. Third Information Survivability Workshop - ISW-2000. `Research Directions and Research Collaborations to Protect the Global Information Society'",
   year          = "2000",
   pages         = "85--89",
   abstract      = "Survivability is a new aspect of dependability that arises from the need to operate certain systems: after an event has occurred that damages a system such that the effects of the damage cannot be completely mitigated; and after a sequence of such damaging events has occurred. The paper presents a definition of survivability and relates it to the field of dependability and the technology of fault tolerance",
   url           = "http://citeseer.nj.nec.com/knight00definition.html",
   studentname   = "",
   summary       = "",
   contribution1 = "",
   contribution2 = "",
   contribution3 = "",
   contribution4 = "",
   contribution5 = "",
   weakness1     = "",
   weakness2     = "",
   weakness3     = "",
   weakness4     = "",
   weakness5     = "",
   interesting   = "high/med/low",
   opinions      = "",
}

; Losq (1977): effects of failures on gracefully degradable systems.
; The conference title is stored in booktitle, required for this entry type.
@inproceedings{Losq77,
   author        = "Losq, J.",
   title         = "Effects of failures on gracefully degradable systems",
   booktitle     = "7th Annual International Conference on Fault-Tolerant Computing",
   year          = "1977",
   pages         = "29--34",
   abstract      = "The recent development of multiprocessor systems that offer resistance to faults by gracefully degrading after a failure opens vast new ranges of application for fault tolerance and high reliability. The paper presents a general model for the evaluation of such systems. It takes into account the internal structure of the hardware, the characteristics of the various detection mechanisms, the unreliability of the software and even the type of applications these systems are used for. It provides many measures of the systems' performance such as: availability, meantime between crashes, average processing power and proportion of time spent in degraded mode. System optimization gives the best values for the number of processors, memories... and shows the trade-offs between hardware and software fault-detection mechanisms. The model is illustrated by an example",
   url           = "http://www.ece.cmu.edu/~ece749/papers/losq77_gracefully_degradable_sys.pdf",
   studentname   = "",
   summary       = "",
   contribution1 = "",
   contribution2 = "",
   contribution3 = "",
   contribution4 = "",
   contribution5 = "",
   weakness1     = "",
   weakness2     = "",
   weakness3     = "",
   weakness4     = "",
   weakness5     = "",
   interesting   = "high/med/low",
   opinions      = "",
}

; Ng and Avizienis (1977): reliability model for gracefully degrading,
; repairable fault-tolerant systems.
; The citation key is a single token (first-author surname plus two-digit
; year, like the neighbouring entries) because BibTeX keys cannot contain
; whitespace. Authors are joined with " and ".
@inproceedings{Ng77,
   author        = "Ng, Ying-Wah and Avizienis, A.",
   title         = "A reliability model for gracefully degrading and repairable fault-tolerant systems",
   booktitle     = "7th Annual International Conference on Fault-Tolerant Computing",
   year          = "1977",
   pages         = "22--28",
   abstract      = "A reliability model for both permanent and transient faults in closed fault-tolerant computer systems has been described previously. The authors report an extension of this model in two directions: the inclusion of individual coverage parameters characterizing every step in the `graceful degradation' procedure which takes place after all spares have been exhausted; and the generalisation of the model to include repairable systems. Both of these extensions are included in the ARIES automated reliability estimation system, which consists of a set of interactive APL programs",
   url           = "http://www.ece.cmu.edu/~ece749/papers/ng77_graceful_degradation.pdf",
   studentname   = "",
   summary       = "",
   contribution1 = "",
   contribution2 = "",
   contribution3 = "",
   contribution4 = "",
   contribution5 = "",
   weakness1     = "",
   weakness2     = "",
   weakness3     = "",
   weakness4     = "",
   weakness5     = "",
   interesting   = "high/med/low",
   opinions      = "",
}

; Poledna (1995): tolerating sensor timing faults via the task-splitting model.
; The volume is kept in its own field rather than inside the journal name.
; url is left empty because no link is recorded for this paper (a bare scheme
; prefix would render as a broken link).
@article{Poledna95,
   author        = "Poledna, S.",
   title         = "Tolerating sensor timing faults in highly responsive hard real-time systems",
   journal       = "IEEE Transactions on Computers",
   year          = "1995",
   volume        = "44",
   number        = "2",
   pages         = "181--191",
   abstract      = "Real-time systems that have to respond to environmental state changes within a very short latency period often use event-triggered task activation. If the system has to function correctly in the presence of sensor faults, event-triggered task activation is not reliable. Faulty sensors may cause task activations to occur too early, too late, or task activations are omitted entirely. In particular, early task activations can overload the system. Time-triggered task activation is reliable, but by defining a competitiveness ratio it is shown that the processor utilization for highly responsive tasks is unacceptably low. To overcome the problems of event-triggered task activation while preserving its good performance the task-splitting model is introduced. The task-splitting model integrates fault tolerance into the analysis and construction of hard real-time systems by using a combination of event-triggered and time-triggered task activation. Based on a general task model, it is independent of any particular scheduling algorithm. The result of this work has influenced the design of a new operating system which will be applied in a robust automotive engine controller of the next generation",
   url           = "",
   studentname   = "",
   summary       = "",
   contribution1 = "",
   contribution2 = "",
   contribution3 = "",
   contribution4 = "",
   contribution5 = "",
   weakness1     = "",
   weakness2     = "",
   weakness3     = "",
   weakness4     = "",
   weakness5     = "",
   interesting   = "high/med/low",
   opinions      = "",
}