; Hand this in to: ece849-staff+hw (at) ece.cmu.edu
;
; NOTE: the address above is deliberately written with "(at)". BibTeX treats an
; at-sign that appears outside an entry as the start of a new entry, so a
; literal e-mail address in this comment would cause a parse error.
;
; Every entry below ends with the same block of empty, non-standard fields
; (studentname, summary, contribution1-5, weakness1-5, interesting, opinions).
; Standard bibliography styles ignore unknown fields; they appear to be a
; report template to be filled in per paper. The value of "interesting" is
; presumably meant to be reduced to one of high, med or low.

; Required Readings

@inproceedings{segall88_fiat,
  author        = {Segall, Z. and Vrsalovic, D. and Siewiorek, D. and Yaskin, D. and Kownacki, J. and Barton, J. and Dancey, R. and Robinson, A. and Lin, T.},
  title         = {{FIAT}-fault injection based automated testing environment},
  booktitle     = {International Symposium on Fault-Tolerant Computing ({FTCS})},
  year          = {1988},
  abstract      = {Distributed/parallel real-time dependable systems employed in critical applications such as avionics, air traffic control and nuclear power plants need property validation during design, implementation, deployment and maintenance phases. There is a distinct challenge to validate dependability properties of such systems. An automated real-time distributed accelerated fault injection environment (FIAT) is presented as an attempt to provide suitable tools for the validation process...},
  url           = {http://ieeexplore.ieee.org/iel2/210/275/00005306.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{hsueh97fault,
  author        = {Hsueh, Mei-Chen and Tsai, Timothy K. and Iyer, Ravishankar K.},
  title         = {Fault Injection Techniques and Tools},
  journal       = {IEEE Computer},
  volume        = {30},
  number        = {4},
  pages         = {75--82},
  year          = {1997},
  abstract      = {Dependability evaluation involves the study of failures and errors. The destructive nature of a crash and long error latency make it difficult to identify the causes of failures in the operational environment. It is particularly hard to recreate a failure scenario for a large, complex system. To identify and understand potential failures, we use an experiment-based approach for studying the dependability of a system. Such an approach is applied not only during the conception and design phases, but also during the prototype and operational phases...},
  url           = {http://citeseer.nj.nec.com/hsueh97fault.html},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{madeira02_fi_space,
  author        = {Madeira, H. and Some, R. R. and Moreira, F. and Costa, D. and Rennels, D.},
  title         = {Experimental evaluation of a {COTS} system for space applications},
  booktitle     = {International Conference on Dependable Systems and Networks ({DSN})},
  year          = {2002},
  abstract      = {This paper evaluates the impact of transient errors in the operating system of a COTS-based system (...) and quantifies their effects at both the OS and at the application level. The study has been conducted using a Software-Implemented Fault Injection tool (Xception) and both realistic programs and synthetic workloads (to focus on specific OS features) have been used. The results provide a comprehensive picture of the impact of faults on LynxOS key features ...},
  url           = {http://ieeexplore.ieee.org/iel5/7991/22107/01028916.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{ademaj03_ttp_fault_injection,
  author        = {Ademaj, A. and Sivencrona, H. and Bauer, G. and Torin, J.},
  title         = {Evaluation of Fault Handling of the {Time-Triggered Architecture} with Bus and Star Topology},
  booktitle     = {International Conference on Dependable Systems and Networks ({DSN})},
  year          = {2003},
  address       = {San Francisco, CA},
  month         = jun,
  url           = {http://ieeexplore.ieee.org/iel5/8589/27228/01209924.pdf},
  abstract      = {Arbitrary faults of a single node in a time-triggered architecture (TTA) bus topology system may cause error propagation to correct nodes and may lead to inconsistent system states. This has been observed in validation work using software implemented fault injection (SWIFI) and heavy-ion fault injection techniques in a TTA cluster. In a TTA system, the membership and the clique avoidance algorithms detect state inconsistencies and force the nodes that do not have the same state with the state of majority of nodes, to restart. Changing the interconnection structure of the cluster to a star topology allows the use of star couplers that will isolate faults of a node, thus guaranteeing consistency, even in the presence of arbitrary node failures. The same SWIFI and heavy-ion fault injection experiments that caused error propagation in bus-based TTA clusters, were performed in the star configuration. No error propagation was observed in a TTA system with the star topology during the execution of SWIFI and heavy-ion experiments},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; Supplemental Reading

@inproceedings{aidemark02_brake_fi,
  author        = {Aidemark, J. and Vinter, J. and Folkesson, P. and Karlsson, J.},
  title         = {Experimental evaluation of time-redundant execution for a brake-by-wire application},
  booktitle     = {International Conference on Dependable Systems and Networks ({DSN})},
  year          = {2002},
  abstract      = {This paper presents an experimental evaluation of a brake-by-wire application that tolerates transient faults by temporal error masking. A specially designed real-time kernel that masks error by triple time-redundant execution and voting executes the application on a fail-stop computer node. The objective is to reduce the number of node failures by masking errors at the computer node level. The real-time kernel always executes the application twice to detect errors...},
  url           = {http://ieeexplore.ieee.org/iel5/7991/22107/01028902.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Arlat89,
  author        = {Arlat, J. and Crouzet, Y. and Laprie, J.-C.},
  title         = {Fault injection for dependability validation of fault-tolerant computing systems},
  booktitle     = {{FTCS} 19 Digest of Papers. The Nineteenth International Symposium on Fault-Tolerant Computing},
  year          = {1989},
  pages         = {348--355},
  abstract      = {The authors address the dependability validation of fault-tolerant computing systems and more specifically the validation of the fault-tolerance mechanisms. Their approach is based on the use of fault injection at the physical level on a hardware/software prototype of the system considered. The place of this approach in a validation-directed design process as well as its place with respect to related works on fault injection are identified. The major requirements and problems related to the development and application of a validation methodology based on fault injection are presented and discussed. The proposed methodology has been implemented through the realization of a general physical-fault injection tool (MESSALINE) whose usefulness is demonstrated by its application to the experimental validation of a subsystem of a computerized interlocking system for railway control applications},
  url           = {http://ieeexplore.ieee.org/iel2/251/3238/00105591.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Carreira98,
  author        = {Carreira, J. and Madeira, H. and Silva, J. G.},
  title         = {{Xception}: a technique for the experimental evaluation of dependability in modern computers},
  journal       = {IEEE Transactions on Software Engineering},
  volume        = {24},
  number        = {2},
  pages         = {125--136},
  year          = {1998},
  abstract      = {An important step in the development of dependable systems is the validation of their fault tolerance properties. Fault injection has been widely used for this purpose, however with the rapid increase in processor complexity, traditional techniques are also increasingly more difficult to apply. This paper presents a new software-implemented fault injection and monitoring environment, called Xception, which is targeted at modern and complex processors. Xception uses the advanced debugging and performance monitoring features existing in most modern processors to inject quite realistic faults by software, and to monitor the activation of the faults and their impact on the target system behavior in detail. Faults are injected with minimum interference with the target application. The target application is not modified, no software traps are inserted, and it is not necessary to execute the target application in special trace mode (the application is executed at full speed). Xception provides a comprehensive set of fault triggers, including spatial and temporal fault triggers, and triggers related to the manipulation of data in memory. Faults injected by Xception can affect any process running on the target system (including the kernel), and it is possible to inject faults in applications for which the source code is not available. Experimental results are presented to demonstrate the accuracy and potential of Xception in the evaluation of the dependability},
  url           = {http://ieeexplore.ieee.org/iel4/32/14664/00666826.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Christmansson96,
  author        = {Christmansson, J. and Chillarege, R.},
  title         = {Generation of an error set that emulates software faults based on field data},
  booktitle     = {Proceedings of the Twenty-Sixth International Symposium on Fault-Tolerant Computing. Digest of Papers},
  year          = {1996},
  pages         = {304--313},
  abstract      = {A significant issue in fault injection experiments is that the injected faults are representative of software faults observed in the field. Another important issue is the time used, as we want experiments to be conducted without excessive time spent waiting for the consequences of a fault. An approach to accelerate the failure process would be to inject errors instead of faults, but this would require a mapping between representative software faults and injectable errors. Furthermore, it must be assured that the injected errors emulate software faults and not hardware faults. These issues were addressed in a study of software faults encountered in one release of a large IBM operating system product. The key results are: A general procedure that uses field data to generate a set of injectable errors, in which each error is defined by: error type, error location and injection condition. The procedure assures that the injected errors emulate software faults and not hardware faults. The faults are uniformly distributed (1.37 fault per module) over the affected modules. The distribution of error categories in the IBM operating system and the distribution of errors in the Tandem Guardian90 operating system reported previously were compared and found to be similar. This result adds a flavor of generality to the field data presented in the current paper},
  url           = {http://ieeexplore.ieee.org/iel3/3791/11109/00534615.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Kanawati95,
  author        = {Kanawati, G. A. and Kanawati, N. A. and Abraham, J. A.},
  title         = {{FERRARI}: a flexible software-based fault and error injection system},
  journal       = {IEEE Transactions on Computers},
  volume        = {44},
  number        = {2},
  pages         = {248--260},
  year          = {1995},
  abstract      = {A major step toward the development of fault-tolerant computer systems is the validation of the dependability properties of these systems. Fault/error injection has been recognized as a powerful approach to validate the fault tolerance mechanisms of a system and to obtain statistics on parameters such as coverages and latencies. This paper describes the methodology and guidelines for the design of flexible software based fault and error injection and presents a tool, FERRARI, that incorporates the techniques. The techniques used to emulate transient errors and permanent faults in software are described in detail. Experimental results are presented for several error detection techniques, and they demonstrate the effectiveness of the software-based error injection tool in evaluating the dependability properties of complex systems},
  url           = {http://ieeexplore.ieee.org/iel1/12/8353/00364536.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Jenn94,
  author        = {Jenn, E. and Arlat, J. and Rimen, M. and Ohlsson, J. and Karlsson, J.},
  title         = {Fault injection into {VHDL} models: the {MEFISTO} tool},
  booktitle     = {Digest of Papers. The Twenty-Fourth International Symposium on Fault-Tolerant Computing},
  year          = {1994},
  pages         = {66--75},
  abstract      = {This paper focuses on the integration of the fault injection methodology within the design process of fault-tolerant systems. Due to its wide spectrum of application and hierarchical features, VHDL has been selected as the simulation language to support such an integration. Suitable techniques for injecting faults into VHDL models are identified and depicted. Then, the main features of the MEFISTO environment aimed at supporting these techniques are described. Finally, some preliminary results obtained with MEFISTO are presented and analyzed},
  url           = {http://ieeexplore.ieee.org/iel2/951/7613/00315656.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Voas97,
  author        = {Voas, J.},
  title         = {Fault injection for the masses},
  journal       = {IEEE Computer},
  volume        = {30},
  number        = {12},
  pages         = {129--130},
  year          = {1997},
  abstract      = {The key technology that the author would like to see adopted by the masses is a family of software fault injection algorithms that can predict where to concentrate testing. From a novelty standpoint, these algorithms were (and still are) unique among other methods of performing fault injection. The author concedes that the algorithms are computational, but the results can provide unequaled information about how ``bad things'' propagate through systems. Because of that, he thinks fault injection methods are valuable to anyone responsible for software quality, including those working in one-person independent software vendors (ISVs) or even the largest corporations},
  url           = {http://ieeexplore.ieee.org/iel4/2/13915/00642820.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}