# Publications

Pinar Tözün and Helena Kotthaus. Scheduling Data-Intensive Tasks on Heterogeneous Many Cores. IEEE Data Engineering Bulletin 42(1), pages 61–72, 2019. http://sites.computer.org/debull/A19mar/p61.pdf

Kuan-Hsun Chen, Georg von der Brüggen and Jian-Jia Chen. Reliability Optimization on Multi-Core Systems with Multi-Tasking and Redundant Multi-Threading. IEEE Transactions on Computers, 2018. DOI: 10.1109/TC.2017.2769044. http://ieeexplore.ieee.org/abstract/document/8094023/

Using Redundant Multithreading (RMT) for error detection and recovery is a prominent technique to mitigate soft-error effects in multi-core systems. Simultaneous Redundant Threading (SRT) on the same core or Chip-level Redundant Multithreading (CRT) on different cores can be adopted to implement RMT. However, only a few previously proposed approaches use adaptive CRT management at the system level, and none of them considers both SRT and CRT at the task level. In this paper, we propose to use a combination of SRT and CRT, called Mixed Redundant Threading (MRT), as an additional option at the task level. In our coarse-grained approach, we consider SRT, CRT, and MRT at the system level simultaneously, whereas existing results apply either SRT or CRT at the system level, but not both. In addition, we consider further fine-grained task-level optimizations to improve the system reliability under hard real-time constraints. To optimize the system reliability, we develop several dynamic programming approaches to select the redundancy levels under federated scheduling. The simulation results illustrate that our approaches can significantly increase the system reliability compared to state-of-the-art techniques.
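The paper's dynamic programs are tailored to federated scheduling and are not reproduced here. As a rough illustration of the underlying idea, selecting one redundancy mode per task (none, SRT, CRT, or MRT) to maximize overall reliability under a shared core budget can be phrased as a knapsack-style dynamic program; the mode costs and reliability values below are invented for this sketch and do not come from the paper.

```python
# Hypothetical per-task options: (mode, cores needed, success probability).
# All numbers are illustrative only; they merely shape the example.
TASKS = [
    [("none", 1, 0.90), ("SRT", 1, 0.97), ("CRT", 2, 0.99), ("MRT", 2, 0.995)],
    [("none", 1, 0.85), ("SRT", 1, 0.95), ("CRT", 2, 0.98), ("MRT", 3, 0.999)],
]

def select_modes(tasks, core_budget):
    """Choose one mode per task, maximizing the product of per-task
    reliabilities subject to a total core budget (knapsack-style DP)."""
    best = {0: (1.0, [])}  # cores used -> (reliability product, modes chosen)
    for options in tasks:
        nxt = {}
        for used, (rel, modes) in best.items():
            for name, cores, prob in options:
                total = used + cores
                if total > core_budget:
                    continue
                cand = (rel * prob, modes + [name])
                if total not in nxt or cand[0] > nxt[total][0]:
                    nxt[total] = cand
        best = nxt
    return max(best.values(), key=lambda v: v[0])

print(select_modes(TASKS, core_budget=4))  # (0.9751, ['MRT', 'CRT'])
```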
Georg von der Brüggen, Jian-Jia Chen, Robert I. Davis and Wen-Hung Kevin Huang. Exact Speedup Factors for Linear-Time Schedulability Tests for Fixed-Priority Preemptive and Non-preemptive Scheduling. Information Processing Letters (IPL) 117, pages 1–5, January 2017. http://dx.doi.org/10.1016/j.ipl.2016.08.001

In this paper, we investigate the quality of several linear-time schedulability tests for preemptive and non-preemptive fixed-priority scheduling of uniprocessor systems. The metric used to assess the quality of these tests is the resource augmentation bound commonly known as the processor speedup factor. The speedup factor of a schedulability test corresponds to the smallest factor by which the processing speed of a uniprocessor needs to be increased such that any task set that is feasible under an optimal preemptive (non-preemptive) work-conserving scheduling algorithm is guaranteed to be schedulable with preemptive (non-preemptive) fixed-priority scheduling if this schedulability test is used, assuming an appropriate priority assignment. We show the surprising result that the exact speedup factors for Deadline Monotonic (DM) priority assignment combined with sufficient linear-time schedulability tests for implicit-, constrained-, and arbitrary-deadline task sets are the same as those obtained for optimal priority assignment policies combined with exact schedulability tests. Thus, in terms of the speedup factors required, there is no penalty in using DM priority assignment and simple linear-time schedulability tests.
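The specific linear-time tests analyzed in the paper are not reproduced here. As a minimal sketch of what a linear-time sufficient schedulability test looks like, the classical Liu and Layland utilization bound for implicit-deadline fixed-priority (rate-monotonic) scheduling can be checked in O(n):

```python
def utilization_bound_schedulable(tasks):
    """Sufficient (not exact) O(n) test: implicit-deadline tasks given as
    (wcet, period) pairs are RM-schedulable on one processor if their total
    utilization is at most n * (2**(1/n) - 1) (Liu and Layland, 1973)."""
    n = len(tasks)
    utilization = sum(wcet / period for wcet, period in tasks)
    return utilization <= n * (2 ** (1 / n) - 1)

# Example: total utilization 0.65 is below the n=3 bound of ~0.7798.
print(utilization_bound_schedulable([(1, 4), (1, 5), (2, 10)]))  # True
```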
Robert I. Davis, Abhilash Thekkilakattil, Oliver Gettings, Radu Dobrin, Sasikumar Punnekkat and Jian-Jia Chen. Exact speedup factors and sub-optimality for non-preemptive scheduling. Real-Time Systems, 2017 (preprint). http://link.springer.com/article/10.1007/s11241-017-9294-3

Fixed-priority scheduling is used in many real-time systems; however, both preemptive and non-preemptive variants (FP-P and FP-NP) are known to be sub-optimal when compared to an optimal uniprocessor scheduling algorithm such as preemptive earliest deadline first (EDF-P). In this paper, we investigate the sub-optimality of fixed-priority non-preemptive scheduling. Specifically, we derive the exact processor speedup factor required to guarantee the feasibility under FP-NP (i.e. schedulability assuming an optimal priority assignment) of any task set that is feasible under EDF-P. As a consequence of this work, we also derive a lower bound on the sub-optimality of non-preemptive EDF (EDF-NP). As this lower bound matches a recently published upper bound for the same quantity, it closes the exact sub-optimality for EDF-NP. It is known that neither preemptive nor non-preemptive fixed-priority scheduling dominates the other; in other words, there are task sets that are feasible on a processor of unit speed under FP-P that are not feasible under FP-NP, and vice versa. Hence, comparing these two algorithms, there are non-trivial speedup factors in both directions. We derive the exact speedup factor required to guarantee the FP-NP feasibility of any FP-P feasible task set. Further, we derive the exact speedup factor required to guarantee FP-P feasibility of any constrained-deadline FP-NP feasible task set.
Maolin Yang, Jian-Jia Chen and Wen-Hung Huang. A misconception in blocking time analyses under multiprocessor synchronization protocols. Real-Time Systems 53(2), pages 187–195, 2017. https://doi.org/10.1007/s11241-016-9261-4

Santiago Pagani, Anuj Pathania, Muhammad Shafique, Jian-Jia Chen and Jörg Henkel. Energy Efficiency for Clustered Heterogeneous Multicores. IEEE Transactions on Parallel and Distributed Systems 28(5), pages 1315–1330, 2017. https://doi.org/10.1109/TPDS.2016.2623616

Santiago Pagani, Heba Khdr, Jian-Jia Chen, Muhammad Shafique, Minming Li and Jörg Henkel. Thermal Safe Power (TSP): Efficient Power Budgeting for Heterogeneous Manycore Systems in Dark Silicon. IEEE Transactions on Computers 66(1), pages 147–162, 2017. http://doi.ieeecomputersociety.org/10.1109/TC.2016.2564969
S. Rehman, Kuan-Hsun Chen, F. Kriebel, A. Toma, M. Shafique, Jian-Jia Chen and J. Henkel. Cross-Layer Software Dependability on Unreliable Hardware. IEEE Transactions on Computers 65(1), pages 80–94, 2016.

To enable reliable embedded systems, it is imperative to leverage the compiler and system software for joint optimization of functional correctness (i.e., vulnerability indexes) and timing correctness (i.e., deadline misses). This paper considers the optimization of the Reliability-Timing (RT) penalty, defined as a linear combination of the vulnerability and the deadline misses. We propose a cross-layer approach to achieve reliable code generation and execution at the compilation and system software layers for embedded systems. This is enabled by the concept of generating multiple versions of given application functions, with diverse performance and reliability tradeoffs, by exploiting different reliability-guided compilation options. As the execution time of a function is not fixed, the selection of versions depends upon the execution behavior of the previous functions. Based on the reliability and execution time profiling of these versions, our reliability-driven system software decides the prioritization of the functions, determining their execution order, and employs dynamic version selection to select a suitable version of each function at run time. Specifically, our scheme builds a schedule table offline to optimize the RT penalty and uses this table at run time to select suitable versions for the subsequent functions. A complex real-world application of "secure video and audio processing" composed of various functions is evaluated for reliable code generation and execution.
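The abstract pins down the Reliability-Timing penalty only as a linear combination of vulnerability and deadline misses. One plausible reading, with a weight and symbols that are assumptions rather than the paper's notation, is:

```latex
% Assumed notation: V = vulnerability index, M = deadline-miss rate,
% 0 <= \alpha <= 1 a designer-chosen weight trading the two objectives.
\mathrm{RT} = \alpha \cdot V + (1 - \alpha) \cdot M
```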
Kuan-Hsun Chen, Jian-Jia Chen, F. Kriebel, S. Rehman, M. Shafique and J. Henkel. Task Mapping for Redundant Multithreading in Multi-Cores with Reliability and Performance Heterogeneity. IEEE Transactions on Computers, 2016. http://ieeexplore.ieee.org/document/7422036/

Due to architectural design, process variations and aging, individual cores in many-core systems exhibit heterogeneous performance. In many-core systems, a commonly adopted soft-error mitigation technique is Redundant Multithreading (RMT), which achieves error detection and recovery through redundant thread execution of an application on different cores. However, task mapping and the task execution mode (i.e., whether a task executes in a reliable mode with RMT or in an unreliable mode without RMT) need to be considered together to achieve resource-efficient reliability. This paper explores how to efficiently assign tasks onto cores with heterogeneous performance properties and determine the execution modes of tasks in order to achieve high reliability while satisfying the timing constraints. We demonstrate that the task mapping problem under heterogeneous performance can be solved by employing the Hungarian algorithm as a subroutine to efficiently assign tasks onto cores, optimizing the system reliability in polynomial time. To obtain efficient task execution modes, we also propose an iterative mode adaptation technique that guarantees the tolerable timing constraint. Our results illustrate that, compared to the state of the art, the proposed approaches achieve up to 80% reliability improvement (on average 20%) under different scenarios of chip frequency variation maps.
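As a sketch of the subroutine the abstract names, the Hungarian algorithm solves a one-task-per-core assignment problem in polynomial time; SciPy's linear_sum_assignment implements it. The cost matrix below is hypothetical, standing in for per-(task, core) reliability penalties rather than the paper's actual objective.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# cost[i][j]: hypothetical penalty of running task i on core j, e.g. the
# negative log of the task's success probability on that core, so that
# minimizing the summed cost maximizes the product of reliabilities.
cost = np.array([
    [0.10, 0.40, 0.25],
    [0.30, 0.05, 0.20],
    [0.20, 0.35, 0.15],
])

tasks, cores = linear_sum_assignment(cost)  # Hungarian algorithm, O(n^3)
for t, c in zip(tasks, cores):
    print(f"task {t} -> core {c}")
print("total penalty:", cost[tasks, cores].sum())  # 0.10 + 0.05 + 0.15 = 0.30
```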
Olaf Neugebauer, Michael Engel and Peter Marwedel. A parallelization approach for resource-restricted embedded heterogeneous MPSoCs inspired by OpenMP. Journal of Systems and Software 125(C), pages 439–448, March 2016. http://www.sciencedirect.com/science/article/pii/S0164121216301534

Future low-end embedded systems will make increased use of heterogeneous MPSoCs. To utilize these systems efficiently, methods and tools are required that support the extraction and implementation of the parallelism typically found in embedded applications. Ideally, large amounts of existing legacy code should be reused and ported to these new systems. Existing parallelization infrastructures, however, mostly support parallelization according to the requirements of HPEC systems. For resource-restricted embedded systems, different parallelization strategies are necessary to achieve additional non-functional objectives such as the reduction of energy consumption. HPC-focused parallelization also assumes processor, memory and communication structures different from low-end embedded systems and therefore wastes optimization opportunities essential for improving the performance of resource-constrained embedded systems. This paper describes a new approach and infrastructure inspired by the OpenMP API to support the extraction and implementation of pipeline parallelism, which is commonly found in complex embedded applications. In addition, advanced techniques to extract parallelism from legacy applications requiring only minimal code modifications are presented. Further, the resulting toolflow combines advanced parallelization, mapping and communication optimization tools, leading to a more efficient approach to exploit parallelism for typical embedded applications on heterogeneous MPSoCs running distributed real-time operating systems.
Jian-Jia Chen. Federated Scheduling Admits No Constant Speedup Factors for Constrained-Deadline DAG Task Systems. Real-Time Systems 52(6), pages 833–838, 2016. An earlier version is available on arXiv: https://arxiv.org/abs/1510.07254. http://doi.org/10.1007/s11241-016-9255-2

In federated scheduling approaches for multiprocessor systems, a task either 1) is restricted to execute sequentially on a single processor or 2) has exclusive access to its assigned processors. There have been several positive results constructing good federated scheduling policies, which have constant speedup factors with respect to any optimal federated scheduling algorithm. This paper answers an open question: "For constrained-deadline task systems with directed acyclic graph (DAG) dependency structures, do federated scheduling policies have a constant speedup factor with respect to any optimal scheduling algorithm?" The answer is "No!" This paper presents an example which demonstrates that any federated scheduling algorithm has a speedup factor of at least $\Omega(\min\{M, N\})$ with respect to any optimal scheduling algorithm, where $N$ is the number of tasks and $M$ is the number of processors.
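For context, the classic federated scheduling rule for implicit-deadline DAG tasks (Li et al.) grants each heavy task with total work $C_i$, critical-path length $L_i$ and deadline $D_i$ a dedicated set of $m_i = \lceil (C_i - L_i)/(D_i - L_i) \rceil$ processors, while light tasks run sequentially. The sketch below assumes that rule (it is not restated in the abstract); the paper's point is precisely that no allocation of this kind can stay within a constant speedup factor once deadlines are constrained.

```python
import math

def federated_allocation(tasks):
    """tasks: (work C, critical path L, deadline D) triples, implicit deadlines.
    Heavy tasks (C > D) receive ceil((C - L) / (D - L)) dedicated processors;
    light tasks (C <= D) run sequentially on a single processor."""
    grants = []
    for c, l, d in tasks:
        if c <= d:      # light task: sequential execution suffices
            grants.append(1)
        else:           # heavy task: needs parallelism, so L must leave slack
            assert l < d, "infeasible: no slack beyond the critical path"
            grants.append(math.ceil((c - l) / (d - l)))
    return grants

print(federated_allocation([(12, 3, 5), (4, 1, 6)]))  # [5, 1]
```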
Jianjun Li, Jian-Jia Chen, Ming Xiong, Guohui Li and Wei Wei. Temporal Consistency Maintenance Upon Partitioned Multiprocessor Platforms. IEEE Transactions on Computers 65(5), pages 1632–1645, 2016. https://doi.org/10.1109/TC.2015.2448088

Jian-Jia Chen and Björn Brandenburg. A Note on the Period Enforcer Algorithm for Self-Suspending Tasks. Leibniz Transactions on Embedded Systems (LITES) 4(1), pages 01:1–01:22, 2016.

In general computing systems, a job (process/task) may suspend itself whilst it is waiting for some activity to complete, e.g., an accelerator to return required data or results from offloaded computation. For real-time embedded systems, such self-suspension can cause substantial performance/schedulability degradation. This has led to the investigation of the impact of self-suspension behaviour on timing predictability, with many results reported since 1990. This paper reviews the design and analysis of scheduling algorithms and schedulability tests for self-suspending tasks in real-time systems. We report that a number of these existing approaches are flawed. As a result, we provide (1) a systematic description of how self-suspending tasks can be handled in both soft and hard real-time systems; (2) an explanation of the existing misconceptions and their potential remedies; (3) an assessment of the influence of such flawed analyses on partitioned multiprocessor fixed-priority scheduling when tasks synchronize access to shared resources; and (4) a computational complexity analysis for different self-suspension task models. In summary, this paper provides a state-of-the-art review of existing real-time analysis of self-suspending tasks to provide a correct platform on which future research can be built.
Sheng-Wei Cheng, Che-Wei Chang, Jian-Jia Chen, Tei-Wei Kuo and Pi-Cheng Hsiu. Many-Core Real-Time Task Scheduling with Scratchpad Memory. IEEE Transactions on Parallel and Distributed Systems 27(10), pages 2953–2966, 2016. https://doi.org/10.1109/TPDS.2016.2516519

Victoria Shpacovitch, Vladimir Temchura, Mikhail Matrosovich, Joachim Hamacher, Julia Skolnik, Pascal Libuschewski, Dominic Siedhoff, Frank Weichert, Peter Marwedel, Heinrich Müller, Klaus Überla, Roland Hergenröder and Alexander Zybin. Application of Surface Plasmon Resonance Imaging Technique for the Detection of Single Spherical Biological Submicron-particles. Analytical Biochemistry: Methods in the Biological Sciences, 2015. Accepted for publication.
Jian-Jia Chen, Mong-Jen Kao, D. T. Lee, Ignaz Rutter and Dorothea Wagner. Online dynamic power management with hard real-time guarantees. Theoretical Computer Science 595, pages 46–64, 2015. http://dx.doi.org/10.1016/j.tcs.2015.06.014

We consider the problem of online dynamic power management that provides hard real-time guarantees for multi-processor systems. In this problem, a set of jobs, each associated with an arrival time, a deadline, and an execution time, arrives to the system in an online fashion. The objective is to compute a non-migrative preemptive schedule of the jobs and a sequence of power on/off operations of the processors so as to minimize the total energy consumption while ensuring that all the deadlines of the jobs are met. We assume that we can use as many processors as necessary. In this paper we examine the complexity of this problem and provide online strategies that lead to practical energy-efficient solutions for real-time multi-processor systems. First, we consider the case for which we know in advance that the set of jobs can be scheduled feasibly on a single processor. We show that, even in this case, the competitive ratio of any online algorithm is at least 2.06. On the other hand, we give a 4-competitive online algorithm that uses at most two processors. For jobs with unit execution times, the competitive ratio of this algorithm improves to 3.59. Second, we relax our assumption by considering as input multiple streams of jobs, each of which can be scheduled feasibly on a single processor. We present a trade-off between the energy efficiency of the schedule and the number of processors to be used. More specifically, for $k$ given job streams and $h$ processors with $h > k$, we give a scheduling strategy such that the energy usage is at most $4 \cdot \lceil k/(h-k) \rceil$ times that used by any schedule which schedules each of the $k$ streams on a separate processor. Finally, we drop the assumptions on the input set of jobs. We show that the competitive ratio of any online algorithm is at least 2.28, even for the case of unit job execution times, for which we further derive an $O(1)$-competitive algorithm.

Santiago Pagani, Jian-Jia Chen and Jörg Henkel. Energy and Peak Power Efficiency Analysis for the Single Voltage Approximation (SVA) Scheme. IEEE Transactions on CAD of Integrated Circuits and Systems 34(9), pages 1415–1428, 2015. https://doi.org/10.1109/TCAD.2015.2406862
Janmartin Jahn, Santiago Pagani, Sebastian Kobbe, Jian-Jia Chen and Jörg Henkel. Runtime Resource Allocation for Software Pipelines. ACM Transactions on Parallel Computing (TOPC) 2(1), article 5, 2015. http://doi.acm.org/10.1145/2742347

Efficiently allocating the computational resources of many-core systems is one of the most prominent challenges, especially when resource requirements may vary unpredictably at runtime. This is even more challenging when facing unreliable cores, a scenario that becomes common as the number of cores increases and integration sizes shrink. To address this challenge, this article presents an optimal method for the allocation of resources to software-pipelined applications. Here we show how runtime observations of the resource requirements of tasks can be used to adapt resource allocations. Furthermore, we show how the optimum can be traded for a high degree of scalability by clustering applications in a distributed, hierarchical manner. To diminish the negative effects of unreliable cores, this article shows how self-organization can effectively restore the integrity of such a hierarchy when it is corrupted by a failing core. Experiments on Intel's 48-core Single-Chip Cloud Computer and in a many-core simulator show that a significant improvement in system throughput can be achieved over the current state of the art.
Che-Wei Chang, Jian-Jia Chen, Tei-Wei Kuo and Heiko Falk. Real-Time Task Scheduling on Island-Based Multi-Core Platforms. IEEE Transactions on Parallel and Distributed Systems 26(2), pages 538–550, 2015. http://dx.doi.org/10.1109/TPDS.2013.2297308

With the increasing number of cores in a computing system, how to coordinate the computing units and heterogeneous memory resources has become extremely critical for real-time systems. This paper explores the joint consideration of memory management and real-time task scheduling on island-based multi-core architectures, where the local memory module of an island offers shorter access time than the global memory module. The objective of this work is to minimize the number of islands needed to successfully schedule real-time tasks. When the required amount of local memory space is specified for each task, a scheduling algorithm is proposed that provides an asymptotic 299-approximation bound. When there is flexibility in determining the needed local memory space for each task, we propose an algorithm with an asymptotic 4-approximation bound to jointly manage memory resources and allocate computing cores. In addition to the worst-case approximation analysis, the proposed algorithms are also evaluated with 82 real-life benchmarks with the support of a worst-case execution time analyzer. Moreover, extensive evaluations are conducted to show the capability of the proposed approaches when used with various computing cores and memory resources.
Santiago Pagani, Jian-Jia Chen and Minming Li. Energy Efficiency on Multi-Core Architectures with Multiple Voltage Islands. IEEE Transactions on Parallel and Distributed Systems 26(6), pages 1608–1621, 2015. http://doi.ieeecomputersociety.org/10.1109/TPDS.2014.2323260

Efficient and effective system-level power management for multi-core systems with multiple voltage islands is necessary for next-generation computing systems. This paper considers energy efficiency for such systems, in which the cores in the same voltage island have to operate at the same supply voltage level. We explore how to map given task sets onto cores, so that each task set is assigned and executed on one core and the energy consumption is minimized. Due to the restriction to operate at the same supply voltage within a voltage island, different mappings result in different energy consumption. Using the simple single frequency approximation scheme (SFA) to decide the voltages and frequencies of individual voltage islands, this paper presents an approximation factor analysis (in terms of energy consumption) for simple heuristic algorithms, and develops a dynamic programming algorithm which derives optimal mapping solutions for energy minimization under SFA. We experimentally evaluate the running time and energy consumption of these algorithms on Intel's Single-Chip Cloud Computer (SCC). Moreover, we conduct simulations for hypothetical platforms with different numbers of voltage islands and cores per island, also considering different task partitioning policies.
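As a sketch of the SFA scheme the abstract builds on: each island runs at a single frequency, the lowest that still covers its most loaded core, and the island's power then follows from whatever per-core power model is assumed. The power model and all constants below are illustrative assumptions, not values from the paper.

```python
def sfa_frequency(cycle_utilizations):
    """SFA: one frequency for the whole island, chosen as the minimum
    frequency meeting timing on the most loaded core. Each entry is a
    core's required cycles per second (its cycle utilization density)."""
    return max(cycle_utilizations)

def core_power(f, alpha=1e-27, gamma=3.0, beta=0.5):
    """Assumed per-core power model P(f) = alpha * f**gamma + beta (watts)."""
    return alpha * f ** gamma + beta

island = [4.0e8, 7.5e8, 2.0e8]      # required cycles/s on each core
f = sfa_frequency(island)           # 7.5e8 Hz for every core on the island
print(f, len(island) * core_power(f))
```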
Inad Aljarrah, Anas Toma and Mohammad Al-Rousan. An Automatic Intelligent System for Diagnosis and Confirmation of Johne's Disease. International Journal of Intelligent Systems Technologies and Applications 14(2), pages 128–144, January 2015. http://dx.doi.org/10.1504/IJISTA.2015.074072

Johne's disease is one of the most widespread bacterial diseases of domestic animals, causing yearly losses of billions of dollars worldwide. In this paper, an automatic intelligent computer-aided system is proposed for the diagnosis of Johne's disease. The system uses image analysis and computer vision techniques to extract features from two different kinds of microscopic images; these features are then classified using neural networks and K-nearest neighbour (K-NN) techniques to diagnose the disease. The proposed system employs histopathological examination to extract 192 different texture features, which are reduced to only 8 features and classified using artificial neural networks (ANN). The acid-fast stain test is used to confirm the positive cases. The construction and testing of both models are carried out using a total of 294 microscopic images: 194 images for the histopathological examination test, which achieves an overall accuracy of 98.33%, and 100 images for the acid-fast stain test, which achieves an accuracy of 96.97%.
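The paper's exact feature pipeline and classifier configurations are not given here; as a generic sketch of the K-NN classification stage over reduced texture features, using scikit-learn on synthetic stand-in data:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Stand-in data: 294 samples with 8 features each, mirroring the paper's
# reduced texture-feature dimensionality; real features would come from
# the microscopic image analysis pipeline.
rng = np.random.default_rng(seed=0)
X = rng.normal(size=(294, 8))
y = rng.integers(0, 2, size=294)  # 0 = negative, 1 = positive case

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print("held-out accuracy:", classifier.score(X_test, y_test))
```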
Muhammad Shafique, Philip Axer, Christoph Borchert, Jian-Jia Chen, Kuan-Hsun Chen, Björn Döbel, Rolf Ernst, Hermann Härtig, Andreas Heinig, Rüdiger Kapitza, Florian Kriebel, Daniel Lohmann, Peter Marwedel, Semeen Rehman, Florian Schmoll and Olaf Spinczyk. Multi-layer software reliability for unreliable hardware. it - Information Technology 57(3), pages 170–180, 2015. http://dx.doi.org/10.1515/itit-2014-1081

This paper presents a multi-layer software reliability approach that leverages multiple software layers (e.g., programming language, compiler, and operating system) to improve the overall system reliability in the presence of unreliable or partly reliable hardware. We present a comprehensive design flow that integrates multiple software layers while accounting for knowledge from the lower hardware layers. We show how multiple software layers operate synergistically to achieve a high degree of reliability.
Helena Kotthaus, Ingo Korb, Michel Lang, Bernd Bischl, Jörg Rahnenführer and Peter Marwedel. Runtime and Memory Consumption Analyses for Machine Learning R Programs. Journal of Statistical Computation and Simulation 85(1), pages 14–29, 2014. http://www.tandfonline.com/eprint/T3mgYXAWdY4kWuDeSv2A/full

R is a multi-paradigm language with a dynamic type system, different object systems and functional characteristics. These characteristics support the development of statistical algorithms at a high level of abstraction. Although R is commonly used in the statistics domain, a major disadvantage is its runtime behavior when handling computation-intensive algorithms. Especially in the domain of machine learning, the execution of pure R programs is often unacceptably slow. Our long-term goal is to resolve these issues, and in this contribution we used the traceR tool to analyse the bottlenecks arising in this domain. We measured the runtime and overall memory consumption of a well-defined set of classical machine learning applications and gained detailed insights into the performance issues of these programs.
Michel Lang, Helena Kotthaus, Peter Marwedel, Claus Weihs, Jörg Rahnenführer and Bernd Bischl. Automatic Model Selection for High-Dimensional Survival Analysis. Journal of Statistical Computation and Simulation 85(1), pages 62–76, 2014. http://www.tandfonline.com/eprint/fwxNXMbwDakhMqRrdMsJ/full

Many different models for the analysis of high-dimensional survival data have been developed over the past years. While some of the models and implementations come with an internal parameter tuning automatism, others require the user to accurately adjust defaults, which often feels like a guessing game. Exhaustively trying out all model and parameter combinations will quickly become tedious or infeasible in computationally intensive settings, even if parallelization is employed. Therefore, we propose to use modern algorithm configuration techniques, e.g. iterated F-racing, to efficiently move through the model hypothesis space and to simultaneously configure algorithm classes and their respective hyperparameters. In our application we study four lung cancer microarray data sets. For these we configure a predictor based on five survival analysis algorithms in combination with eight feature selection filters. We parallelize the optimization and all comparison experiments with the BatchJobs and BatchExperiments R packages.

Timon Kelter, Heiko Falk, Peter Marwedel, Sudipta Chattopadhyay and Abhik Roychoudhury. Static Analysis of Multi-Core TDMA Resource Arbitration Delays. Real-Time Systems 50(2), pages 185–229, March 2014. http://link.springer.com/article/10.1007%2Fs11241-013-9189-x
It shows by example how probabilistic bit flips are systematically abstracted and propagated towards higher abstraction levels up to the application software layer, and how RAP can be used to parameterize architecture-level resilience methods.}, }The Resilience Articulation Point (RAP) model aims at provisioning researchers and developers with a probabilistic fault abstraction and error propagation framework covering all hardware/software layers of a System on Chip. RAP assumes that physically induced faults at the technology or CMOS device layer will eventually manifest themselves as a single or multiple bit flip(s). When probabilistic error functions for specific fault origins are known at the bit or signal level, knowledge about the unit of design and its environment allows the transformation of the bit-related error functions into characteristic higher layer representations, such as error functions for data words, Finite State Machine (FSM) state, macro-interfaces or software variables. Thus, design concerns at higher abstraction layers can be investigated without the necessity to further consider the full details of lower levels of design. This paper introduces the ideas of RAP based on examples of radiation induced soft errors in SRAM cells, voltage variations and sequential CMOS logic. It shows by example how probabilistic bit flips are systematically abstracted and propagated towards higher abstraction levels up to the application software layer, and how RAP can be used to parameterize architecture-level resilience methods. Sudipta Chattopadhyay, Lee Kee Chong, Abhik Roychoudhury, Timon Kelter, Peter Marwedel and Heiko Falk.A Unified WCET Analysis Framework for Multi-core Platforms. ACM Transactions on Embedded Computing Systems (TECS) 13 4s July 2014[BibTeX][Link][Abstract]@article { kelter:2014:tecs, author = {Chattopadhyay, Sudipta and Chong, Lee Kee and Roychoudhury, Abhik and Kelter, Timon and Marwedel, Peter and Falk, Heiko}, title = {A Unified WCET Analysis Framework for Multi-core Platforms}, journal = {ACM Transactions on Embedded Computing Systems (TECS)}, year = {2014}, volume = {13}, number = {4s}, month = {July}, url = {http://dl.acm.org/citation.cfm?id=2584654}, keywords = {wcet}, confidential = {n}, abstract = {With the advent of multi-core architectures, worst case execution time (WCET) analysis has become an increasingly difficult problem. In this paper, we propose a unified WCET analysis framework for multi-core processors featuring both shared cache and shared bus. Compared to other previous works, our work differs by modeling the interaction of shared cache and shared bus with other basic micro-architectural components (e.g. pipeline and branch predictor). In addition, our framework does not assume a timing anomaly free multi-core architecture for computing the WCET. A detailed experiment methodology suggests that we can obtain reasonably tight WCET estimates in a wide range of benchmark programs.}, }With the advent of multi-core architectures, worst case execution time (WCET) analysis has become an increasingly difficult problem. In this paper, we propose a unified WCET analysis framework for multi-core processors featuring both shared cache and shared bus. Compared to other previous works, our work differs by modeling the interaction of shared cache and shared bus with other basic micro-architectural components (e.g. pipeline and branch predictor). In addition, our framework does not assume a timing anomaly free multi-core architecture for computing the WCET.
A detailed experiment methodology suggests that we can obtain reasonably tight WCET estimates in a wide range of benchmark programs. Dominic Siedhoff, Pascal Libuschewski, Frank Weichert, Alexander Zybin, Peter Marwedel and Heinrich Müller.Modellierung und Optimierung eines Biosensors zur Detektion viraler Strukturen. Bildverarbeitung für die Medizin. Lecture Notes in Informatics. Springer-Verlag, pages 108-113 2014[BibTeX][PDF][Link][Abstract]@article { Siedhoff/etal/2014a, author = {Siedhoff, Dominic and Libuschewski, Pascal and Weichert, Frank and Zybin, Alexander and Marwedel, Peter and M\"uller, Heinrich}, title = {Modellierung und Optimierung eines Biosensors zur Detektion viraler Strukturen}, journal = {Bildverarbeitung f\"ur die Medizin. Lecture Notes in Informatics. Springer-Verlag}, year = {2014}, pages = {108-113}, url = {http://link.springer.com/chapter/10.1007/978-3-642-54111-7_24}, file = {http://link.springer.com/content/pdf/10.1007%2F978-3-642-54111-7_24.pdf}, confidential = {n}, abstract = {Die echtzeitf\"ahige Detektion mannigfaltiger viraler Strukturen gewinnt zunehmend an Bedeutung. Hier setzt die vorliegende Arbeit an, welche die adaptive Modellierung und Optimierung eines Biosensors vorstellt und zur automatischen Synthese von segmentierten Trainingsdaten nutzt, was den manuellen Aufwand zur Adaption an unterschiedliche Virustypen nachhaltig reduziert. Im vorliegenden Anwendungsfall des PAMONO-Sensors werden \"uber diesen Ansatz die Parameter eines GPGPU-basierten Objekt-Detektors genetisch optimiert. Die G\"ute des Ansatzes zeigt sich bei der \"Ubertragung der optimierten Parameter auf reale Eingabedaten: Die Qualit\"atsma{\ss}e Precision und Recall erreichen Werte gr\"o{\ss}er als 0.92.}, }Real-time detection of manifold viral structures is becoming increasingly important. This is where the present work comes in: it presents the adaptive modeling and optimization of a biosensor and uses it for the automatic synthesis of segmented training data, which substantially reduces the manual effort of adapting to different virus types. In the present application case of the PAMONO sensor, this approach is used to genetically optimize the parameters of a GPGPU-based object detector. The quality of the approach shows in the transfer of the optimized parameters to real input data: the quality measures precision and recall reach values greater than 0.92. Santiago Pagani and Jian{-}Jia Chen.Energy Efficiency Analysis for the Single Frequency Approximation {(SFA)} Scheme. {ACM} Trans. Embedded Comput. Syst. 13 5s, pages 158:1--158:25 2014[BibTeX][Link]@article { DBLP:journals/tecs/PaganiC14, author = {Pagani, Santiago and Chen, Jian{-}Jia}, title = {Energy Efficiency Analysis for the Single Frequency Approximation {(SFA)} Scheme}, journal = {{ACM} Trans. Embedded Comput. Syst.}, year = {2014}, bdsk-url-1 = {http://doi.acm.org/10.1145/2660490}, bdsk-url-2 = {http://dx.doi.org/10.1145/2660490}, volume = {13}, number = {5s}, pages = {158:1--158:25}, url = {http://doi.acm.org/10.1145/2660490}, confidential = {n}, } Anas Toma and Jian-Jia Chen.Computation Offloading for Frame-Based Real-Time Tasks under Given Server Response Time Guarantees.
LITES 1 2, pages 02:1--02:21 2014[BibTeX][Link]@article { DBLP:journals/lites/TomaC14, author = {Toma, Anas and Chen, Jian-Jia}, title = {Computation Offloading for Frame-Based Real-Time Tasks under Given Server Response Time Guarantees}, journal = {LITES}, year = {2014}, bdsk-url-1 = {http://dx.doi.org/10.4230/LITES-v001-i002-a002}, volume = {1}, number = {2}, pages = {02:1--02:21}, url = {http://dx.doi.org/10.4230/LITES-v001-i002-a002}, confidential = {n}, } Jens Nellesen, Frank Weichert, Constantin Timm, Rudolf Alexander Kerbitz and Wolfgang Tillmann.Trade-Off Analysis considering Tomogram Quality and Performance of a Parallel Computing Hardware Realization of Katsevich{\textquoteright}s Reconstruction Algorithm. IEEE Transactions on Nuclear Science 60 5, pages 3270-3281 Oct. 2013, Publikation[BibTeX][Link]@article { 3009, author = {Nellesen, Jens and Weichert, Frank and Timm, Constantin and Kerbitz, Rudolf Alexander and Tillmann, Wolfgang}, title = {Trade-Off Analysis considering Tomogram Quality and Performance of a Parallel Computing Hardware Realization of Katsevich{\textquoteright}s Reconstruction Algorithm}, journal = {IEEE Transactions on Nuclear Science}, year = {2013}, volume = {60}, number = {5}, pages = {3270-3281}, month = {Oct.}, note = {Publikation}, url = {http://dx.doi.org/10.1109/TNS.2013.2265660}, confidential = {n}, } Ahmed A. Eltawil, Michael Engel, Bibiche Geuskens, Amin Khajeh Djahromi, Fadi J Kurdahi, Peter Marwedel, Smail Niar and Mazen A. R. Saghir.A Survey of Cross-Layer Power-Performance-Reliability in Multi and Many Core Systems-on-Chip. Embedded Hardware Design: Microprocessors and Microsystems 2013, Invited paper[BibTeX][Link][Abstract]@article { kurdahi:2013:EHD, author = {Eltawil, Ahmed A. and Engel, Michael and Geuskens, Bibiche and Djahromi, Amin Khajeh and Kurdahi, Fadi J and Marwedel, Peter and Niar, Smail and Saghir, Mazen A. R.}, title = {A Survey of Cross-Layer Power-Performance-Reliability in Multi and Many Core Systems-on-Chip}, journal = {Embedded Hardware Design: Microprocessors and Microsystems}, year = {2013}, note = {Invited paper}, url = {http://www.sciencedirect.com/science/article/pii/S0141933113000987}, keywords = {ders}, confidential = {n}, abstract = {As systems-on-chip increase in complexity, the underlying technology presents us with significant challenges due to increased power consumption as well as decreased reliability. Today, designers must consider building systems that achieve the requisite functionality and performance using components that may be unreliable. In order to do so, it is crucial to understand the close interplay between the different layers of a system: technology, platform, and application. This will enable the most general tradeoff exploration, reaping the most benefits in power, performance and reliability. This paper surveys various cross layer techniques and approaches for power, performance, and reliability tradeoffs are technology, circuit, architecture and application layers. }, }As systems-on-chip increase in complexity, the underlying technology presents us with significant challenges due to increased power consumption as well as decreased reliability. Today, designers must consider building systems that achieve the requisite functionality and performance using components that may be unreliable. In order to do so, it is crucial to understand the close interplay between the different layers of a system: technology, platform, and application. 
This will enable the most general tradeoff exploration, reaping the most benefits in power, performance and reliability. This paper surveys various cross layer techniques and approaches for power, performance, and reliability tradeoffs at the technology, circuit, architecture and application layers. Florian Schmoll, Andreas Heinig, Peter Marwedel and Michael Engel.Improving the Fault Resilience of an H.264 Decoder using Static Analysis Methods. ACM Transactions on Embedded Computing Systems (TECS) 13 1s, pages 31:1--31:27 December 2013[BibTeX][Link][Abstract]@article { schmoll:2013:tecs, author = {Schmoll, Florian and Heinig, Andreas and Marwedel, Peter and Engel, Michael}, title = {Improving the Fault Resilience of an H.264 Decoder using Static Analysis Methods}, journal = {ACM Transactions on Embedded Computing Systems (TECS)}, year = {2013}, volume = {13}, number = {1s}, pages = {31:1--31:27}, month = {dec}, url = {http://doi.acm.org/10.1145/2536747.2536753}, keywords = {ders}, confidential = {n}, abstract = {Fault tolerance rapidly evolves into one of the most significant design objectives for embedded systems due to reduced semiconductor structures and supply voltages. However, resource-constrained systems cannot afford traditional error correction for overhead and cost reasons. New methods are required to sustain acceptable service quality in case of errors while avoiding crashes. We present a flexible fault-tolerance approach that is able to select correction actions depending on error semantics using application annotations and static analysis approaches. We verify the validity of our approach by analyzing the vulnerability and improving the reliability of an H.264 decoder using flexible error handling.}, }Fault tolerance rapidly evolves into one of the most significant design objectives for embedded systems due to reduced semiconductor structures and supply voltages. However, resource-constrained systems cannot afford traditional error correction for overhead and cost reasons. New methods are required to sustain acceptable service quality in case of errors while avoiding crashes. We present a flexible fault-tolerance approach that is able to select correction actions depending on error semantics using application annotations and static analysis approaches. We verify the validity of our approach by analyzing the vulnerability and improving the reliability of an H.264 decoder using flexible error handling. Horst Schirmeier, Ingo Korb, Olaf Spinczyk and Michael Engel.Efficient Online Memory Error Assessment and Circumvention for Linux with RAMpage. International Journal of Critical Computer-Based Systems Special Issue on PRDC 2011 Dependable Architecture and Analysis 2013[BibTeX][Link][Abstract]@article { schirmeier:2013:ijccbs, author = {Schirmeier, Horst and Korb, Ingo and Spinczyk, Olaf and Engel, Michael}, title = {Efficient Online Memory Error Assessment and Circumvention for Linux with RAMpage}, journal = {International Journal of Critical Computer-Based Systems}, year = {2013}, volume = {Special Issue on PRDC 2011 Dependable Architecture and Analysis}, url = {http://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijccbs}, keywords = {ders}, confidential = {n}, abstract = {Memory errors are a major source of reliability problems in computer systems. Undetected errors may result in program termination or, even worse, silent data corruption.
Recent studies have shown that the frequency of permanent memory errors is an order of magnitude higher than previously assumed and regularly affects everyday operation. To reduce the impact of memory errors, we designed RAMpage, a purely software-based infrastructure to assess and circumvent permanent memory errors in a running commodity x86-64 Linux-based system. We briefly describe the design and implementation of RAMpage and present new results from an extensive qualitative and quantitative evaluation. These results show the efficiency of our approach -- RAMpage is able to provide a smooth graceful degradation in case of permanent memory errors while requiring only a small overhead in terms of CPU time, energy, and memory space. }, }Memory errors are a major source of reliability problems in computer systems. Undetected errors may result in program termination or, even worse, silent data corruption. Recent studies have shown that the frequency of permanent memory errors is an order of magnitude higher than previously assumed and regularly affects everyday operation. To reduce the impact of memory errors, we designed RAMpage, a purely software-based infrastructure to assess and circumvent permanent memory errors in a running commodity x86-64 Linux-based system. We briefly describe the design and implementation of RAMpage and present new results from an extensive qualitative and quantitative evaluation. These results show the efficiency of our approach -- RAMpage is able to provide a smooth graceful degradation in case of permanent memory errors while requiring only a small overhead in terms of CPU time, energy, and memory space. Pascal Libuschewski, Dominic Siedhoff, Constantin Timm and Frank Weichert.Mobile Detektion viraler Pathogene durch echtzeitfähige GPGPU-Fuzzy-Segmentierung. Bildverarbeitung für die Medizin, pages 326-331 March 2013[BibTeX][PDF][Link][Abstract]@article { Libuschewski/etal/2013a, author = {Libuschewski, Pascal and Siedhoff, Dominic and Timm, Constantin and Weichert, Frank}, title = {Mobile Detektion viraler Pathogene durch echtzeitf{\"a}hige GPGPU-Fuzzy-Segmentierung}, journal = {Bildverarbeitung f{\"u}r die Medizin}, year = {2013}, pages = {326-331}, month = {March}, url = {http://www.springer.com/computer/image+processing/book/978-3-642-36479-2}, file = {http://link.springer.com/content/pdf/10.1007%2F978-3-642-36480-8_57.pdf}, confidential = {n}, abstract = {Die vorliegende Arbeit stellt einen neuartigen Fuzzy-Logik-basierten Segmentierungsalgorithmus zur Detektion von biologischen Viren in stark Artefakt-behafteten Bildsequenzen vor, der konform ist zu den differenzierten Ressourcenbeschr\"ankungen mobiler Endger\"ate. Als Sensor kommt der neuartige PAMONO-Biosensor zum indirekten Nachweis von Viren mittels optischer Mikroskopie zum Einsatz. Die Segmentierungen weisen eine hohe positive \"Ubereinstimmung bei idealisierten synthetischen Segmentierungen auf, die durch den Fuzzy-Ansatz insbesondere bei kleinen Viren/schlechtem Signal-Rausch-Verh\"altnis nochmals verbessert wird. Ferner wird gezeigt, dass eine GPU-gest\"utzte Datenanalyse die Detektion viraler Strukturen in Echtzeit auf mobilen Endger\"aten erm\"oglicht, und im Vergleich zur CPU den Energieverbrauch im Durchschnitt um Faktor 3.7 senkt.}, }This work presents a novel fuzzy-logic-based segmentation algorithm for the detection of biological viruses in heavily artifact-laden image sequences that complies with the particular resource constraints of mobile devices.
As the sensor, the novel PAMONO biosensor for the indirect detection of viruses via optical microscopy is employed. The segmentations show high positive agreement with idealized synthetic segmentations, which the fuzzy approach improves further, in particular for small viruses and poor signal-to-noise ratios. Furthermore, it is shown that GPU-supported data analysis enables the detection of viral structures in real time on mobile devices and, compared to the CPU, lowers the energy consumption by a factor of 3.7 on average. Pascal Libuschewski, Dominic Siedhoff and Frank Weichert.Energy-aware Design Space Exploration for GPGPUs. Computer Science - Research and Development, pages 1-6 2013, DOI: 10.1007/s00450-013-0237-5[BibTeX][PDF][Link][Abstract]@article { Libuschewski/etal/2013c, author = {Libuschewski, Pascal and Siedhoff, Dominic and Weichert, Frank}, title = {Energy-aware Design Space Exploration for GPGPUs}, journal = {Computer Science - Research and Development}, year = {2013}, pages = {1-6}, note = {DOI: 10.1007/s00450-013-0237-5}, url = {http://dx.doi.org/10.1007/s00450-013-0237-5}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013_libuschewski_c.pdf}, confidential = {n}, abstract = {This work presents a novel approach for automatically determining the most power- or energy-efficient Graphics Processing Units (GPUs) with respect to given parallel computation problems.}, }This work presents a novel approach for automatically determining the most power- or energy-efficient Graphics Processing Units (GPUs) with respect to given parallel computation problems. Constantin Timm, Frank Weichert, Christian Prasse, Heinrich Müller, Michael Hompel and Peter Marwedel.Efficient Resource Management based on Non-Functional Requirements for Sensor/Actuator Networks. Ninth International Network Conference (INC) 2012 , pages 207-217 July 2012[BibTeX][Abstract]@article { timm:2012, author = {Timm, Constantin and Weichert, Frank and Prasse, Christian and M{\"u}ller, Heinrich and Hompel, Michael and Marwedel, Peter}, title = {Efficient Resource Management based on Non-Functional Requirements for Sensor/Actuator Networks}, journal = {Ninth International Network Conference (INC) 2012 }, year = {2012}, pages = {207-217}, month = {July}, confidential = {n}, abstract = {In this paper, a novel resource management approach is presented for publish-subscribe middleware for sensor/actuator networks. The resource management was designed with the possibility to add non-functional requirements at runtime to subscription messages. This approach allows utilizing service level agreements that can then be employed in order to guarantee a certain quality of service or to reduce the energy consumption of a sensor node in a sensor/actuator network. As an example, a sensor/actuator network for facility logistics system (a conveyor belt system) is evaluated with respect to energy consumption. This sensor/actuator network is mostly controlled by image processing based sensor nodes. It is shown that an adaptive processing interval for these sensor nodes can reduce the energy consumption of the entire network. The utilization of non-functional requirements allows the system to adapt -- after software development -- to context changes such as the extension of the conveyor belt systems topology. }, }In this paper, a novel resource management approach is presented for publish-subscribe middleware for sensor/actuator networks.
The resource management was designed with the possibility to add non-functional requirements at runtime to subscription messages. This approach allows utilizing service level agreements that can then be employed in order to guarantee a certain quality of service or to reduce the energy consumption of a sensor node in a sensor/actuator network. As an example, a sensor/actuator network for a facility logistics system (a conveyor belt system) is evaluated with respect to energy consumption. This sensor/actuator network is mostly controlled by image processing based sensor nodes. It is shown that an adaptive processing interval for these sensor nodes can reduce the energy consumption of the entire network. The utilization of non-functional requirements allows the system to adapt -- after software development -- to context changes such as the extension of the conveyor belt system's topology. Constantin Timm, Markus Görlich, Frank Weichert, Peter Marwedel and Heinrich Müller.Feedback-Based Global Instruction Scheduling for GPGPU Applications. 12th International Conference Computational Science and Its Applications 2012 (ICCSA) June 2012[BibTeX][PDF][Abstract]@article { timm:2012a, author = {Timm, Constantin and G{\"o}rlich, Markus and Weichert, Frank and Marwedel, Peter and M{\"u}ller, Heinrich}, title = {Feedback-Based Global Instruction Scheduling for GPGPU Applications}, journal = {12th International Conference Computational Science and Its Applications 2012 (ICCSA) }, year = {2012}, month = {June}, file = {http://www.springerlink.com/content/6q51h75768637644/fulltext.pdf}, confidential = {n}, abstract = {In the face of the memory wall even in high bandwidth systems such as GPUs, an efficient handling of memory accesses and memory-related instructions is mandatory. Up to now, memory performance considerations were only made for GPGPU applications at source code level. This is not enough when optimizing an application towards high performance: The code has to be optimized at assembly level as well. Due to the spreading of GPGPU-capable hardware in smaller and smaller devices, the energy consumption of a program is – besides the performance – an important optimization goal. In this paper, a novel compiler optimization technique, called FALIS (Feedback-based and memory-Aware gLobal Instruction Scheduling), is presented based on global instruction scheduling and multi-objective genetic algorithms. The approach uses a profiling-based feedback in order to take the measured performance and energy consumption values inside a compiler into account. Profiling on the real hardware platform is important in order to consider the characteristics of the underlying hardware. FALIS increases runtime performance of a GPGPU application by up to 13.02% and decreases energy consumption by up to 10.23%. }, }In the face of the memory wall even in high bandwidth systems such as GPUs, an efficient handling of memory accesses and memory-related instructions is mandatory. Up to now, memory performance considerations were only made for GPGPU applications at source code level. This is not enough when optimizing an application towards high performance: The code has to be optimized at assembly level as well. Due to the spreading of GPGPU-capable hardware in smaller and smaller devices, the energy consumption of a program is – besides the performance – an important optimization goal.
In this paper, a novel compiler optimization technique, called FALIS (Feedback-based and memory-Aware gLobal Instruction Scheduling), is presented based on global instruction scheduling and multi-objective genetic algorithms. The approach uses a profiling-based feedback in order to take the measured performance and energy consumption values inside a compiler into account. Profiling on the real hardware platform is important in order to consider the characteristics of the underlying hardware. FALIS increases runtime performance of a GPGPU application by up to 13.02% and decreases energy consumption by up to 10.23%. Heiko Falk and Peter Marwedel.Introduction to the Special Section on SCOPES'09. ACM Transactions on Embedded Computing Systems (TECS) 11S 1, pages 17:1--17:3 June 2012[BibTeX][Link]@article { falk:2012:tecs, author = {Falk, Heiko and Marwedel, Peter}, title = {Introduction to the Special Section on SCOPES'09}, journal = {ACM Transactions on Embedded Computing Systems (TECS)}, year = {2012}, volume = {11S}, number = {1}, pages = {17:1--17:3}, month = {jun}, url = {http://doi.acm.org/10.1145/2180887.2180894}, confidential = {n}, } Pascal Libuschewski, Frank Weichert and Constantin Timm.Parameteroptimierte und GPGPU-basierte Detektion viraler Strukturen innerhalb Plasmonen-unterstützter Mikroskopiedaten. Bildverarbeitung für die Medizin, Springer Verlag 237-242 March 2012[BibTeX][PDF][Link][Abstract]@article { libuschewski/etal/2012a, author = {Libuschewski, Pascal and Weichert, Frank and Timm, Constantin}, title = {Parameteroptimierte und GPGPU-basierte Detektion viraler Strukturen innerhalb Plasmonen-unterst{\"u}tzter Mikroskopiedaten}, journal = {Bildverarbeitung f{\"u}r die Medizin, Springer Verlag}, year = {2012}, pages = {237-242}, month = {March}, url = {http://link.springer.com/chapter/10.1007/978-3-642-28502-8_42}, file = {http://link.springer.com/content/pdf/10.1007%2F978-3-642-28502-8_42.pdf}, confidential = {n}, abstract = {Die lokale Verf\"ugbarkeit von effizienten und leistungsf \"ahigen Biosensoren, z.B. an Flugh\"afen, gewinnt durch die zunehmende Verbreitung viraler Infektionen zunehmend an Bedeutung. Die zentralen Herausforderungen f\"ur entsprechende in situ Virusdetektionssysteme sind eine schnelle und sichere Erkennung der Viren respektive die Adaptivit\"at an unterschiedliche Auspr\"agungen von Erregern. Optische Verfahren, wie die neuartige Plasmonen-unterst\"utzte Mikroskopie von Nanoobjekten erlauben es, diesen Anforderungen zu entsprechen. Aufgrund starker multipler Artefaktbelastung des Signals und hohen Datenmengen (zeitlichen und \"ortlichen), werden nachhaltige Anforderungen an die Bildrestauration und -analyse gestellt. Hier setzt die vorliegende Arbeit an, welche eine GPGPU-basierte Bildrestaurations- und Bildanalysepipeline vorstellt. \"Uber eine Kombination aus lokaler und globaler Parameteroptimierung mittels Genetischer Algorithmen kann eine h\"ohere Effektivit\"at der einzelnen Stufen der Verarbeitung erzielt werden, aber auch im \"ubergreifenden Verbund – dies zeigt sich nachhaltig in der Erkennungsrate des Biosensors f\"ur Viren.}, }The local availability of efficient and powerful biosensors, e.g. at airports, is becoming increasingly important with the growing spread of viral infections. The central challenges for such in situ virus detection systems are fast and reliable detection of the viruses as well as adaptivity to different manifestations of pathogens.
Optical methods such as the novel plasmon-assisted microscopy of nano-objects make it possible to meet these requirements. Owing to heavy multiple artifact contamination of the signal and high data volumes (in time and space), substantial demands are placed on image restoration and analysis. This is where the present work comes in: it presents a GPGPU-based image restoration and image analysis pipeline. Through a combination of local and global parameter optimization by means of genetic algorithms, higher effectiveness can be achieved for the individual processing stages, but also for the pipeline as a whole; this is reflected lastingly in the detection rate of the biosensor for viruses. Constantin Timm, Frank Weichert, Peter Marwedel and Heinrich Müller.Design Space Exploration Towards a Realtime and Energy-Aware GPGPU-based Analysis of Biosensor Data. Computer Science - Research and Development, Special Issue "International Conference on Energy-Aware High Performance Computing (ENA-HPC)" September 2011[BibTeX][PDF][Abstract]@article { Timm:2011c, author = {Timm, Constantin and Weichert, Frank and Marwedel, Peter and M{\"u}ller, Heinrich}, title = {Design Space Exploration Towards a Realtime and Energy-Aware GPGPU-based Analysis of Biosensor Data}, journal = {Computer Science - Research and Development, Special Issue ``International Conference on Energy-Aware High Performance Computing (ENA-HPC)''}, year = {2011}, month = {September}, file = {http://dx.doi.org/10.1007/s00450-011-0187-8}, confidential = {n}, abstract = {In this paper, novel objectives for the design space exploration of GPGPU applications are presented. The design space exploration takes the combination of energy efficiency and realtime requirements into account. This is completely different to the commonest high performance computing objective, which is to accelerate an application as much as possible. As a proof-of-concept, a GPGPU based image processing and virus detection pipeline for a newly developed biosensor, called PAMONO, is presented. The importance of realtime capable and portable biosensors increases according to rising number of worldwide spreading virus infections. The local availability of biosensors at e.g. airports to detect viruses in-situ demand to take costs and energy for the development of GPGPU-based biosensors into account. The consideration of the energy is especially important with respect to green computing. The results of the conducted design space exploration show that during the design process of a GPGPU-based application the platform must also be evaluated to get the most energy-aware solution. In particular, it was shown that increasing numbers of parallel running cores need not decrease the energy consumption. }, }In this paper, novel objectives for the design space exploration of GPGPU applications are presented. The design space exploration takes the combination of energy efficiency and realtime requirements into account. This is completely different from the most common high-performance computing objective, which is to accelerate an application as much as possible. As a proof-of-concept, a GPGPU based image processing and virus detection pipeline for a newly developed biosensor, called PAMONO, is presented. The importance of realtime-capable and portable biosensors increases with the rising number of virus infections spreading worldwide. The local availability of biosensors at e.g.
airports to detect viruses in situ demands taking the costs and energy of developing GPGPU-based biosensors into account. The consideration of energy is especially important with respect to green computing. The results of the conducted design space exploration show that during the design process of a GPGPU-based application the platform must also be evaluated to get the most energy-aware solution. In particular, it was shown that increasing numbers of parallel running cores need not decrease the energy consumption. Dominic Siedhoff, Frank Weichert, Pascal Libuschewski and Constantin Timm.Detection and Classification of Nano-Objects in Biosensor Data. Microscopic Image Analysis with Applications in Biology (MIAAB 2011) September 2011[BibTeX][PDF][Abstract]@article { Siedhoff:2011, author = {Siedhoff, Dominic and Weichert, Frank and Libuschewski, Pascal and Timm, Constantin}, title = {Detection and Classification of Nano-Objects in Biosensor Data}, journal = {Microscopic Image Analysis with Applications in Biology (MIAAB 2011)}, year = {2011}, month = {September}, file = {http://www.miaab.org/miaab-2011-heidelberg-papers/miaab-2011-h-siedhoff.pdf}, confidential = {n}, abstract = {Preventing viral infections from spreading quickly in a heavily connected world demands for reliable diagnostic methods providing results promptly. The PAMONO biosensor (Plasmon Assisted Microscopy Of Nano-Objects) is a novel technique capable of attaining these properties. In this paper, a processing pipeline is proposed for analyzing PAMONO sensor data. Firstly, virus candidate pixels are detected in the sensor output by matching their series of measured intensities over time to characteristic patterns. This is achieved in a fully parallel GPGPU approach. Each spatially coherent set of matching pixels defines a candidate object, represented as a polygon. The overall set of polygons encompasses true virus polygons as well as several types of false detections. The polygons are classified based on their shape, separating the viruses from false detections. With regard to this step, the suitability of different geometrical features (form-factors) and classification methods is explored. A set of classifiers consisting of Naive Bayes, RIPPER rule induction, C4.5 Decision Trees, $k$-Nearest-Neighbor and Support Vector Machines (SVM) is applied to a two- and to a multi-class formulation of the classification problem. Training and searching for robust optimal parameterizations of the parametric classifiers is achieved in an offline step, using evolutionary optimization. Furthermore, an evolutionary feature-selection is conducted for all classifiers. The classification performances are evaluated for different types of polystyrene nano-particles. With regard to the diverse nature of the encountered artifacts, a one-class SVM approach, learning from positive examples only, is an attractive option. As will be shown, it can compete with the two- and multi-class approaches if training and test data originate from the same type of nano-particles. }, }Preventing viral infections from spreading quickly in a heavily connected world demands reliable diagnostic methods providing results promptly. The PAMONO biosensor (Plasmon Assisted Microscopy Of Nano-Objects) is a novel technique capable of attaining these properties. In this paper, a processing pipeline is proposed for analyzing PAMONO sensor data.
Firstly, virus candidate pixels are detected in the sensor output by matching their series of measured intensities over time to characteristic patterns. This is achieved in a fully parallel GPGPU approach. Each spatially coherent set of matching pixels defines a candidate object, represented as a polygon. The overall set of polygons encompasses true virus polygons as well as several types of false detections. The polygons are classified based on their shape, separating the viruses from false detections. With regard to this step, the suitability of different geometrical features (form-factors) and classification methods is explored. A set of classifiers consisting of Naive Bayes, RIPPER rule induction, C4.5 Decision Trees, k-Nearest-Neighbor and Support Vector Machines (SVM) is applied to a two- and to a multi-class formulation of the classification problem. Training and searching for robust optimal parameterizations of the parametric classifiers is achieved in an offline step, using evolutionary optimization. Furthermore, an evolutionary feature-selection is conducted for all classifiers. The classification performances are evaluated for different types of polystyrene nano-particles. With regard to the diverse nature of the encountered artifacts, a one-class SVM approach, learning from positive examples only, is an attractive option. As will be shown, it can compete with the two- and multi-class approaches if training and test data originate from the same type of nano-particles. Frank Weichert, Constantin Timm, Marcel Gaspar, Alexander Zybin, Evgeny L. Gurevich, Heinrich Müller and Peter Marwedel.GPGPU-basierte Echtzeitdetektion von Nanoobjekten mittels Plasmonen-unterstützter Mikroskopie. Bildverarbeitung für die Medizin, pages 39-43 March 2011[BibTeX][PDF][Abstract]@article { Weichert:2011a, author = {Weichert, Frank and Timm, Constantin and Gaspar, Marcel and Zybin, Alexander and Gurevich, Evgeny L. and M{\"u}ller, Heinrich and Marwedel, Peter}, title = {GPGPU-basierte Echtzeitdetektion von Nanoobjekten mittels Plasmonen-unterst{\"u}tzter Mikroskopie}, journal = {Bildverarbeitung f{\"u}r die Medizin}, year = {2011}, pages = {39-43}, month = {mar}, file = {http://ceur-ws.org/Vol-715/bvm2011_10.pdf}, confidential = {n}, abstract = {Die Verf\"ugbarkeit echtzeitf\"ahiger und mobiler Biosensoren gewinnt durch die zunehmende Verbreitung viraler Infektionen zunehmend an Bedeutung. Im Gegensatz zu Virusdetektionsmethoden wie beispielsweise ELISA erlaubt die neuartige Plasmonen-unterst\"utzte Mikroskopie von Nanoobjekten, Proben innerhalb von wenigen Minuten auf Viren analysieren zu k\"onnen. Die Herausforderung f\"ur ein, auf dieser Analysemethode beruhendes In-situ-Virusdetektionssystem, besteht in der Echtzeitverarbeitung von extrem hohen Datenmengen. Hier setzt die vorliegende Arbeit an, welche eine hoch parallele GPU-basierte Verarbeitungspipeline zur echtzeitf\"ahigen Virusdetektion vorstellt. Durch die konsequente Ausnutzung der GPGPU-F\"ahigkeiten von Grafikkarten kann auf teure Spezialhardware verzichtet werden, um eine echtzeitkonforme Beschleunigung notwendiger Bildverarbeitungs- und Bildanalysealgorithmen bereitzustellen, die auch den Anforderungen an ein eingebettetes Virusdetektionssystem gerecht wird.}, }The availability of realtime-capable and mobile biosensors is becoming increasingly important with the growing spread of viral infections.
In contrast to virus detection methods such as ELISA, the novel plasmon-assisted microscopy of nano-objects makes it possible to analyze samples for viruses within a few minutes. The challenge for an in situ virus detection system based on this analysis method lies in processing extremely large data volumes in real time. This is where the present work comes in: it presents a highly parallel GPU-based processing pipeline for realtime-capable virus detection. By consistently exploiting the GPGPU capabilities of graphics cards, expensive special-purpose hardware can be dispensed with, providing real-time-compliant acceleration of the necessary image processing and image analysis algorithms that also meets the requirements of an embedded virus detection system. Arindam Mallik, Stylianos Mamagkakis, Christos Baloukas, Lazaros Papadopoulos, Dimitrios Soudris, Sander Stuijk, Olivera Jovanovic, Florian Schmoll, Daniel Cordes, Robert Pyka, Peter Marwedel, François Capman, Séverin Collet, Nikolaos Mitas and Dimitrios Kritharidis.MNEMEE – An automated toolflow for parallelization and memory management in MPSoC platforms. User forum presentation, 48th Design Automation Conference (DAC), San Diego, California, USA June 2011[BibTeX][Abstract]@article { mnemee:dac:2011, author = {Mallik, Arindam and Mamagkakis, Stylianos and Baloukas, Christos and Papadopoulos, Lazaros and Soudris, Dimitrios and Stuijk, Sander and Jovanovic, Olivera and Schmoll, Florian and Cordes, Daniel and Pyka, Robert and Marwedel, Peter and Capman, Fran\c{c}ois and Collet, S\'everin and Mitas, Nikolaos and Kritharidis, Dimitrios}, title = {MNEMEE – An automated toolflow for parallelization and memory management in MPSoC platforms}, journal = {User forum presentation, 48th Design Automation Conference (DAC), San Diego, California, USA}, year = {2011}, month = {June}, confidential = {n}, abstract = {Mobile, intelligent devices that are able to deliver communication services and multimedia content anytime, anywhere are the dominant players in the field of embedded systems. These systems combine many different streaming applications (e.g., H.264-AVC, JPEG2000, WiMax) in a single system. The basic characteristics of these applications are typically large computational requirements and intensive data transfer and storage needs. As a result, the primary platform for such applications is Multiprocessor Systems-on-Chip (MPSoCs). These MPSoCs can deliver the computational power required by novel applications. Modern MPSoCs contain a complex memory hierarchy that allows applications to meet their data transfer and storage requirements. However, it brings the additional challenge to the system designers to efficiently map applications onto processors and memories. The design choices have a large impact on the energy consumption and memory footprint of the final system. This in the end has a direct impact on the system cost and the battery lifetime of the system, i.e., the user experience. The MNEMEE project addresses the aforementioned challenges by introducing a novel tool flow that integrates several state-of-the-art source-to-source optimization methodologies and tools. It provides a methodology to automatically parallelize the source code of an application. It also optimizes the static and dynamic data structures in the source code such that they can efficiently use the memory hierarchy in an MPSoC.
Finally, the tool flow maps the parallelized source code onto the processors and memories in an MPSoC. Many of the methodologies that are used in the tool flow are based on multi-objective exploration strategies. This allows designers to make design trade-offs and it makes product customization at design-time much easier. The MNEMEE tool flow provides a completely automated trajectory to map sequential applications onto an MPSoC while exploiting its memory hierarchy. The primary objective is to reduce the energy consumption and design-time of the new embedded system.}, }Mobile, intelligent devices that are able to deliver communication services and multimedia content anytime, anywhere are the dominant players in the field of embedded systems. These systems combine many different streaming applications (e.g., H.264-AVC, JPEG2000, WiMax) in a single system. The basic characteristics of these applications are typically large computational requirements and intensive data transfer and storage needs. As a result, the primary platform for such applications is Multiprocessor Systems-on-Chip (MPSoCs). These MPSoCs can deliver the computational power required by novel applications. Modern MPSoCs contain a complex memory hierarchy that allows applications to meet their data transfer and storage requirements. However, it brings the additional challenge to the system designers to efficiently map applications onto processors and memories. The design choices have a large impact on the energy consumption and memory footprint of the final system. This in the end has a direct impact on the system cost and the battery lifetime of the system, i.e., the user experience. The MNEMEE project addresses the aforementioned challenges by introducing a novel tool flow that integrates several state-of-the-art source-to-source optimization methodologies and tools. It provides a methodology to automatically parallelize the source code of an application. It also optimizes the static and dynamic data structures in the source code such that they can efficiently use the memory hierarchy in an MPSoC. Finally, the tool flow maps the parallelized source code onto the processors and memories in an MPSoC. Many of the methodologies that are used in the tool flow are based on multi-objective exploration strategies. This allows designers to make design trade-offs and it makes product customization at design-time much easier. The MNEMEE tool flow provides a completely automated trajectory to map sequential applications onto an MPSoC while exploiting its memory hierarchy. The primary objective is to reduce the energy consumption and design-time of the new embedded system. Paul Lokuciejewski, Sascha Plazar, Heiko Falk, Peter Marwedel and Lothar Thiele.Approximating Pareto optimal compiler optimization sequences---a trade-off between WCET, ACET and code size.
Software: Practice and Experience May 2011, DOI 10.1002/spe.1079[BibTeX][PDF][Abstract]@article { lokuciejewski:11:spe, author = {Lokuciejewski, Paul and Plazar, Sascha and Falk, Heiko and Marwedel, Peter and Thiele, Lothar}, title = {Approximating Pareto optimal compiler optimization sequences---a trade-off between WCET, ACET and code size}, journal = {Software: Practice and Experience}, year = {2011}, month = {may}, note = {DOI 10.1002/spe.1079}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-spe.pdf}, confidential = {n}, abstract = {With the growing complexity of embedded systems software, high code quality can only be achieved using a compiler. Sophisticated compilers provide a vast spectrum of various optimizations to improve code aggressively w.\,r.\,t.~different objective functions, e.\,g., average-case execution time \textit{(ACET)} or code size. Due to the complex interactions between the optimizations, the choice for a promising sequence of code transformations is not trivial. Compiler developers address this problem by proposing standard optimization levels, e.\,g., \textit{O3} or \textit{Os}. However, previous studies have shown that these standard levels often miss optimization potential or might even result in performance degradation. In this paper, we propose the first adaptive WCET-aware compiler framework for an automatic search of compiler optimization sequences which yield highly optimized code. Besides the objective functions ACET and code size, we consider the worst-case execution time \textit{(WCET)} which is a crucial parameter for real-time systems. To find suitable trade-offs between these objectives, stochastic evolutionary multi-objective algorithms identifying Pareto optimal solutions for the objectives $\langle$WCET, ACET$\rangle$ and $\langle$WCET, code size$\rangle$ are exploited. A comparison based on statistical performance assessments is performed which helps to determine the most suitable multi-objective optimizer. The effectiveness of our approach is demonstrated on real-life benchmarks showing that standard optimization levels can be significantly outperformed.}, }With the growing complexity of embedded systems software, high code quality can only be achieved using a compiler. Sophisticated compilers provide a vast spectrum of various optimizations to improve code aggressively w.r.t. different objective functions, e.g., average-case execution time (ACET) or code size. Due to the complex interactions between the optimizations, the choice for a promising sequence of code transformations is not trivial. Compiler developers address this problem by proposing standard optimization levels, e.g., O3 or Os. However, previous studies have shown that these standard levels often miss optimization potential or might even result in performance degradation. In this paper, we propose the first adaptive WCET-aware compiler framework for an automatic search of compiler optimization sequences which yield highly optimized code. Besides the objective functions ACET and code size, we consider the worst-case execution time (WCET) which is a crucial parameter for real-time systems. To find suitable trade-offs between these objectives, stochastic evolutionary multi-objective algorithms identifying Pareto optimal solutions for the objectives ⟨WCET, ACET⟩ and ⟨WCET, code size⟩ are exploited. A comparison based on statistical performance assessments is performed which helps to determine the most suitable multi-objective optimizer. The effectiveness of our approach is demonstrated on real-life benchmarks showing that standard optimization levels can be significantly outperformed.
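As an editorial aside for readers unfamiliar with the Pareto notion used in the abstract above, the following minimal Python sketch shows how the non-dominated ⟨WCET, code size⟩ points would be filtered out of a set of candidate measurements. The numbers are invented for illustration, and the sketch does not reproduce the paper's stochastic evolutionary optimizer, which additionally searches the space of optimization sequences that produces such points:

    # Minimal illustration of Pareto filtering over <WCET, code size> candidates.
    # All data is hypothetical; a real flow would obtain these pairs from a
    # WCET analyzer and the linker for each compiler optimization sequence.

    def pareto_front(points):
        """Return the points not dominated by any other point
        (minimization in every dimension)."""
        def dominates(q, p):
            return all(qi <= pi for qi, pi in zip(q, p)) and q != p
        return [p for p in points if not any(dominates(q, p) for q in points)]

    # Hypothetical (WCET in cycles, code size in bytes) results of five
    # compiler optimization sequences.
    candidates = [(1200, 840), (1100, 900), (1300, 700), (1100, 860), (1250, 720)]
    print(sorted(pareto_front(candidates)))
    # -> [(1100, 860), (1200, 840), (1250, 720), (1300, 700)]

Each surviving point represents one trade-off a designer may pick; (1100, 900) is filtered out because (1100, 860) is at least as good in both objectives.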
Heiko Falk and Paul Lokuciejewski.A compiler framework for the reduction of worst-case execution times. Journal on Real-Time Systems 46 2, pages 251-300 October 2010, DOI 10.1007/s11241-010-9101-x[BibTeX][PDF][Abstract]@article { falk:10:springer-rts, author = {Falk, Heiko and Lokuciejewski, Paul}, title = {A compiler framework for the reduction of worst-case execution times}, journal = {Journal on Real-Time Systems}, year = {2010}, volume = {46}, number = {2}, pages = {251-300}, month = {oct}, note = {DOI 10.1007/s11241-010-9101-x}, keywords = {wcet}, file = {http://vg09.met.vgwort.de/na/1fbe4260e3244c11b33e4c6d0ffa10e3?l=http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-rts.pdf}, confidential = {n}, abstract = {The current practice to design software for real-time systems is tedious. There is almost no tool support that assists the designer in automatically deriving safe bounds of the \textit{worst-case execution time (WCET)} of a system during code generation and in systematically optimizing code to reduce WCET. This article presents concepts and infrastructures for WCET-aware code generation and optimization techniques for WCET reduction. All together, they help to obtain code explicitly optimized for its worst-case timing, to automate large parts of the real-time software design flow, and to reduce costs of a real-time system by allowing to use tailored hardware.}, }The current practice to design software for real-time systems is tedious. There is almost no tool support that assists the designer in automatically deriving safe bounds of the worst-case execution time (WCET) of a system during code generation and in systematically optimizing code to reduce WCET. This article presents concepts and infrastructures for WCET-aware code generation and optimization techniques for WCET reduction. Altogether, they help to obtain code explicitly optimized for its worst-case timing, to automate large parts of the real-time software design flow, and to reduce the costs of a real-time system by allowing the use of tailored hardware. Frank Weichert, Marcel Gaspar, Constantin Timm, Alexander Zybin, Evgeny Gurevich, Michael Engel, Heinrich Müller and Peter Marwedel.Signal Analysis and Classification for Surface Plasmon Assisted Microscopy of Nanoobjects. Sensors and Actuators B: Chemical, Elsevier 151, pages 281-290 2010[BibTeX]@article { weichert:2010:sensoractuators, author = {Weichert, Frank and Gaspar, Marcel and Timm, Constantin and Zybin, Alexander and Gurevich, Evgeny and Engel, Michael and M{\"u}ller, Heinrich and Marwedel, Peter}, title = {Signal Analysis and Classification for Surface Plasmon Assisted Microscopy of Nanoobjects}, journal = {Sensors and Actuators B: Chemical, Elsevier}, year = {2010}, volume = {151}, pages = {281-290}, confidential = {n}, } Manish Verma, Lars Wehmeyer and Peter Marwedel.Cache-Aware Scratchpad-Allocation Algorithms for Energy-Constrained Embedded Systems. IEEE Trans.
on CAD of Integrated Circuits and System (TCAD)}, year = {2006}, volume = {25}, number = {10}, pages = {2035--2051}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-tcad.pdf}, confidential = {n}, abstract = {In the context of mobile embedded devices, reducing energy is one of the prime objectives. Memories are responsible for a significant percentage of a system's aggregate energy consumption. Consequently, novel memories as well as novel-memory architectures are being designed to reduce the energy consumption. Caches and scratchpads are two contrasting memory architectures. The former relies on hardware logic while the latter relies on software for its utilization. To meet different requirements, most contemporary high-end embedded microprocessors include on-chip instruction and data caches along with a scratchpad. Previous approaches for utilizing scratchpad did not consider caches and hence fail for the contemporary high-end systems. Instructions are allocated onto the scratchpad, while taking into account the behavior of the instruction cache present in the system. The problem of scratchpad allocation is solved using a heuristic and also optimally using an integer linear programming formulation. An average reduction of 7\% and 23\% in processor cycles and instruction-memory energy, respectively, is reported when compared against a previously published technique. The average deviation between optimal and nonoptimal solutions was found to be less than 6\% both in terms of processor cycles and energy. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of the presented approach against that of a preloaded loop cache, an average reduction of 9\% and 29\% in processor cycles and instruction-memory energy, respectively, is reported.}, }In the context of mobile embedded devices, reducing energy is one of the prime objectives. Memories are responsible for a significant percentage of a system's aggregate energy consumption. Consequently, novel memories as well as novel-memory architectures are being designed to reduce the energy consumption. Caches and scratchpads are two contrasting memory architectures. The former relies on hardware logic while the latter relies on software for its utilization. To meet different requirements, most contemporary high-end embedded microprocessors include on-chip instruction and data caches along with a scratchpad. Previous approaches for utilizing scratchpad did not consider caches and hence fail for the contemporary high-end systems. Instructions are allocated onto the scratchpad, while taking into account the behavior of the instruction cache present in the system. The problem of scratchpad allocation is solved using a heuristic and also optimally using an integer linear programming formulation. An average reduction of 7% and 23% in processor cycles and instruction-memory energy, respectively, is reported when compared against a previously published technique. The average deviation between optimal and nonoptimal solutions was found to be less than 6% both in terms of processor cycles and energy. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of the presented approach against that of a preloaded loop cache, an average reduction of 9% and 29% in processor cycles and instruction-memory energy, respectively, is reported. 
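As a much-simplified aside on the allocation problem described in the preceding abstract: if one ignores the instruction-cache interaction that the paper's ILP formulation explicitly models, scratchpad allocation can be pictured as a 0/1 knapsack over profiled memory objects. The Python sketch below is a hedged illustration under that assumption; the object names, sizes, and energy gains are hypothetical, and this is not the authors' algorithm:

    # Toy 0/1 knapsack view of scratchpad allocation.
    # Hypothetical profile data; the cache-aware ILP of the paper additionally
    # models instruction-cache behavior, which this exhaustive search ignores.
    from itertools import combinations

    def allocate(objects, capacity):
        """Pick the subset of (name, size, energy_gain) objects maximizing
        total energy gain while fitting into the scratchpad capacity."""
        best = (0, ())
        for r in range(len(objects) + 1):
            for subset in combinations(objects, r):
                size = sum(s for _, s, _ in subset)
                gain = sum(g for _, _, g in subset)
                if size <= capacity:
                    best = max(best, (gain, tuple(n for n, _, _ in subset)))
        return best

    # Hypothetical memory objects: (name, size in bytes, energy gain per run).
    objs = [("loop_A", 512, 90), ("loop_B", 1024, 120), ("table_C", 256, 30)]
    print(allocate(objs, capacity=1280))  # -> (150, ('loop_B', 'table_C'))

Practical allocators replace the exhaustive enumeration with an ILP solver or a heuristic, since the number of candidate objects makes brute force infeasible for real programs.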
Manish Verma and Peter Marwedel.Overlay Techniques for Scratchpad Memories in Low Power Embedded Processors. IEEE TVLSI 14 8 2006[BibTeX][PDF][Abstract]@article { verma:06:tvlsi, author = {Verma, Manish and Marwedel, Peter}, title = {Overlay Techniques for Scratchpad Memories in Low Power Embedded Processors}, journal = {IEEE TVLSI}, year = {2006}, volume = {14}, number = {8}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-tvlsi.pdf}, confidential = {n}, abstract = {Energy consumption is the one of the important parameters to be optimized during the design of portable embedded systems. Thus, most of the contemporary portable devices feature low-power processors coupled with on-chip memories (\textem{e.g.} caches, scratchpads). Scratchpads are better than traditional caches in terms of power, performance, area and predictability. However, unlike caches they depend upon software allocation techniques for their utilization. In this paper, we present scratchpad overlay techniques which analyze the application and insert instructions to dynamically copy both variables and code segments onto the scratchpad at runtime. We demonstrate that the problem of overlaying scratchpad is an extension of the Global Register Allocation problem. We present optimal and near-optimal approaches for solving the scratchpad overlay problem. The near-optimal scratchpad overlay approach achieves close to the optimal results and is significantly faster than the optimal approach. Our approaches improve upon the previously known static allocation technique for assigning both variables and code segments onto the scratchpad. The evaluation of the approaches for ARM7 processor reports average energy and execution time reductions of 26\% and 14\% over the static approach, respectively. Additional experiments comparing the overlayed scratchpads against unified caches of the same size, report average energy and execution time savings of 20\% and 10\%, respectively. We also report data memory energy reductions of 45\%-57\% due to the insertion of a 1024 bytes scratchpad memory in the memory hierarchy of a digital signal processor (DSP).}, }Energy consumption is one of the most important parameters to be optimized during the design of portable embedded systems. Thus, most of the contemporary portable devices feature low-power processors coupled with on-chip memories (e.g. caches, scratchpads). Scratchpads are better than traditional caches in terms of power, performance, area and predictability. However, unlike caches they depend upon software allocation techniques for their utilization. In this paper, we present scratchpad overlay techniques which analyze the application and insert instructions to dynamically copy both variables and code segments onto the scratchpad at runtime. We demonstrate that the problem of overlaying scratchpad is an extension of the Global Register Allocation problem. We present optimal and near-optimal approaches for solving the scratchpad overlay problem. The near-optimal scratchpad overlay approach achieves close to the optimal results and is significantly faster than the optimal approach. Our approaches improve upon the previously known static allocation technique for assigning both variables and code segments onto the scratchpad. The evaluation of the approaches for the ARM7 processor reports average energy and execution time reductions of 26% and 14% over the static approach, respectively.
Peter Marwedel and Birgit Sirocic.Interaktive Visualisierung dynamischer Vorgänge in Rechensystemen mittels Multimediatechnik. 2003[BibTeX][PDF]@article { marwedel:03, author = {Marwedel, Peter and Sirocic, Birgit}, title = {Interaktive Visualisierung dynamischer Vorg\"ange in Rechensystemen mittels Multimediatechnik}, year = {2003}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-inwida.pdf}, confidential = {n}, } Peter Marwedel (ed.).Guest Editor's Introduction: Processor-Based Designs. IEEE Design and Test of Computers 2002[BibTeX][Abstract]@article { marwedel:02:journal, author = {Marwedel (ed.), Peter}, title = {Guest Editor's Introduction: Processor-Based Designs}, journal = {IEEE Design and Test of Computers}, year = {2002}, confidential = {n}, abstract = {Welcome to this special issue on embedded systems, which are information processing systems embedded in larger products, such as cars, airplanes, and fabrication or communication equipment. In most embedded systems, information processing is invisible to the user. The main reason for buying such a system is not information processing, but the system's overall functionality. Embedded systems must also be efficient in terms of weight, cost, size, and energy consumption. Moreover, they must be dependable and in many cases must meet real time constraints. The market for embedded systems will grow significantly over the next years. For example, Kontron, one of the leading vendors of embedded computers, expects 40\% annual growth over the next four years in the voice traffic network equipment market (http://www.icsadvent.com/applications/ap_convergence.cfm).}, }Welcome to this special issue on embedded systems, which are information processing systems embedded in larger products, such as cars, airplanes, and fabrication or communication equipment. In most embedded systems, information processing is invisible to the user. The main reason for buying such a system is not information processing, but the system's overall functionality. Embedded systems must also be efficient in terms of weight, cost, size, and energy consumption. Moreover, they must be dependable and in many cases must meet real-time constraints. The market for embedded systems will grow significantly over the next years. For example, Kontron, one of the leading vendors of embedded computers, expects 40% annual growth over the next four years in the voice traffic network equipment market (http://www.icsadvent.com/applications/ap_convergence.cfm). Peter Marwedel (ed.).Special Issue on Software and Compilers for Embedded Systems. IEEE TCAD 20 11 November 2001[BibTeX][PDF]@article { marw:01:tcad, author = {Marwedel (ed.), Peter}, title = {Special Issue on Software and Compilers for Embedded Systems}, journal = {IEEE TCAD}, year = {2001}, volume = {20}, number = {11}, month = {nov}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-tcad.pdf}, confidential = {n}, } Lars Wehmeyer, Manoj Kumar Jain, Stefan Steinke, Peter Marwedel and M. Balakrishnan.Analysis of the Influence of Register File Size on Energy Consumption, Code Size and Execution Time.
IEEE TCAD 20 11 November 2001[BibTeX][PDF][Abstract]@article { wehm:01:conf, author = {Wehmeyer, Lars and Jain, Manoj Kumar and Steinke, Stefan and Marwedel, Peter and Balakrishnan, M.}, title = {Analysis of the Influence of Register File Size on Energy Consumption, Code Size and Execution Time}, journal = {IEEE TCAD}, year = {2001}, volume = {20}, number = {11}, month = {nov}, keywords = {ecc}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-tcad_LW.pdf}, confidential = {n}, abstract = {Interest in low power embedded systems has increased considerably in the past few years. To produce low power code and to allow an estimation of power consumption of software running on embedded systems, a power model was developed based on physical measurement using an evaluation board and integrated into a compiler and profiler. The compiler uses the power information to choose instruction sequences consuming less power, whereas the profiler gives information about the total power consumed during execution of the generated program. The used compiler is parameterized such that e.g. the register file size may be changed. The resulting code is evaluated with respect to code size, performance and power consumption for different register file sizes. The extracted information is especially useful during application analysis and architecture space exploration in ASIP design. Our analysis gives the designer the ability to estimate the desirable register file size for an ASIP design. The size of the register file should be considered as a design parameter since it has a strong impact on the energy consumption of embedded systems.}, }Interest in low power embedded systems has increased considerably in the past few years. To produce low power code and to allow an estimation of the power consumption of software running on embedded systems, a power model was developed based on physical measurements using an evaluation board and integrated into a compiler and profiler. The compiler uses the power information to choose instruction sequences consuming less power, whereas the profiler gives information about the total power consumed during execution of the generated program. The compiler is parameterized such that, e.g., the register file size may be changed. The resulting code is evaluated with respect to code size, performance and power consumption for different register file sizes. The extracted information is especially useful during application analysis and architecture space exploration in ASIP design. Our analysis gives the designer the ability to estimate the desirable register file size for an ASIP design. The size of the register file should be considered a design parameter since it has a strong impact on the energy consumption of embedded systems.
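The exploration flow implied by this abstract can be pictured as a loop over candidate register file sizes, scoring each by a weighted cost. Everything below is a placeholder: evaluate() stands in for the measurement-based compiler/profiler tool chain, and the spill model and cost weights are invented.

def evaluate(num_regs):
    # Placeholder for the measurement-based compiler/profiler tool chain:
    # returns (cycles, energy) for a given register file size. The spill
    # model only mimics the typical trend (more registers -> fewer spills,
    # but a larger register file itself costs some energy).
    spills = max(0, 24 - 2 * num_regs)
    cycles = 10_000 + 300 * spills
    energy = 5.0 + 0.15 * spills + 0.02 * num_regs
    return cycles, energy

def explore(candidates=(4, 8, 12, 16, 24, 32), w_cycles=1.0, w_energy=1000.0):
    # Score each candidate register file size by a weighted sum of cycles
    # and energy, and suggest the cheapest one.
    def cost(r):
        cycles, energy = evaluate(r)
        return w_cycles * cycles + w_energy * energy
    return min(candidates, key=cost)

print("suggested register file size:", explore())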
Jens Wagner and Rainer Leupers.C Compiler Design for a Network Processor. IEEE TCAD 20 11 November 2001[BibTeX][PDF][Abstract]@article { wagner:01:tcad, author = {Wagner, Jens and Leupers, Rainer}, title = {C Compiler Design for a Network Processor}, journal = {IEEE TCAD}, year = {2001}, volume = {20}, number = {11}, month = {nov}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-tcad_JWRL.pdf}, confidential = {n}, abstract = {One important problem in code generation for embedded processors is the design of efficient compilers for target machines with application specific architectures. This paper outlines the design of a C compiler for an industrial application specific processor (ASIP) for telecom applications. The target ASIP is a network processor with special instructions for bit-level access to data registers, which is required for packet oriented communication protocol processing. From a practical viewpoint, we describe the main challenges in exploiting these application specific features in a C compiler, and we show how a compiler backend has been designed that accommodates these features by means of compiler intrinsics and a dedicated register allocator. The compiler is fully operational, and first experimental results indicate that C-level programming of the ASIP leads to good code quality without the need for time-consuming assembly programming.}, }One important problem in code generation for embedded processors is the design of efficient compilers for target machines with application-specific architectures. This paper outlines the design of a C compiler for an industrial application-specific processor (ASIP) for telecom applications. The target ASIP is a network processor with special instructions for bit-level access to data registers, which is required for packet-oriented communication protocol processing. From a practical viewpoint, we describe the main challenges in exploiting these application-specific features in a C compiler, and we show how a compiler backend has been designed that accommodates these features by means of compiler intrinsics and a dedicated register allocator. The compiler is fully operational, and first experimental results indicate that C-level programming of the ASIP leads to good code quality without the need for time-consuming assembly programming.
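The bit-level register accesses mentioned above, the kind of operation the ASIP's special instructions accelerate and the compiler exposes as intrinsics, behave roughly as follows. The function names are hypothetical, not the processor's actual intrinsics.

# Bit-field access of the kind the network processor's special instructions
# support and the C compiler exposes as intrinsics. Hypothetical names.

def extract_bits(word: int, offset: int, width: int) -> int:
    # Read a bit field (e.g. a protocol header field) out of a 32-bit register.
    return (word >> offset) & ((1 << width) - 1)

def insert_bits(word: int, offset: int, width: int, value: int) -> int:
    # Write a bit field into a 32-bit register, leaving the other bits intact.
    mask = ((1 << width) - 1) << offset
    return ((word & ~mask) | ((value << offset) & mask)) & 0xFFFFFFFF

# Example: pull the 4-bit IPv4 header-length field out of the first header
# word (assuming the version/IHL byte sits in the top byte of the word).
first_word = 0x45000054
print(extract_bits(first_word, 24, 4))         # -> 5
print(hex(insert_bits(first_word, 24, 4, 6)))  # -> 0x46000054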
Peter Marwedel.Eingebettete Systeme. Log in 6/2000, pages 16-18 2001[BibTeX][PDF]@article { marwedel:06:log, author = {Marwedel, Peter}, title = {Eingebettete Systeme}, journal = {Log in 6/2000}, year = {2001}, pages = {16-18}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/}, confidential = {n}, } Peter Marwedel.Compilers for Embedded Processors (in German). Informationstechnik und Technische Informatik (it+ti) 4, pages 194-199 June 2001[BibTeX][Abstract]@article { marwedel:01:itti, author = {Marwedel, Peter}, title = {Compilers for Embedded Processors (in German)}, journal = {Informationstechnik und Technische Informatik (it+ti)}, year = {2001}, volume = {4}, pages = {194-199}, month = {jun}, confidential = {n}, abstract = {Characteristics of embedded systems include an increasing use of software and the need for efficient realisations. As a consequence, techniques for generating efficient software are required. For DSP-, Multimedia- and VLIW-Processors, techniques for generating efficient assembly language programs from C hardly exist. This contribution describes new compiler techniques exploiting the properties of the processors just mentioned. With these techniques, it is possible to obtain a code quality which is comparable to that of manually generated assembly language programs.}, }Characteristics of embedded systems include an increasing use of software and the need for efficient realisations. As a consequence, techniques for generating efficient software are required. For DSP, multimedia and VLIW processors, techniques for generating efficient assembly language programs from C hardly exist. This contribution describes new compiler techniques exploiting the properties of the processors just mentioned. With these techniques, it is possible to obtain a code quality which is comparable to that of manually generated assembly language programs. Stefan Steinke, Lars Wehmeyer and Peter Marwedel.Energieeinsparung durch neue Compiler-Optimierungen. Elektronik in 13/2001, pages 62-67 June 2001[BibTeX][Abstract]@article { steinke:01:elektr, author = {Steinke, Stefan and Wehmeyer, Lars and Marwedel, Peter}, title = {Energieeinsparung durch neue Compiler-Optimierungen}, journal = {Elektronik in 13/2001}, year = {2001}, pages = {p. 62-67}, month = {jun}, keywords = {ecc}, confidential = {n}, abstract = {Der Einsatz von C-Compilern verdr\"angt immer mehr die Assembler-Programmierung. Diese Tatsache kann man sich in Systemen zunutze machen, die auf niedrigen Energieverbrauch angewiesen sind. Energie-optimierende Compiler k\"onnen hier Assemblercode mit gleicher Laufzeit generieren, der weniger Energie verbraucht als der von zeitoptimierenden Compilern. Die vorgestellten Techniken k\"onnen aber auch ohne Einsatz eines Compilers dem Assembler-Programmierer M\"oglichkeiten des Energiesparens aufzeigen.}, }C compilers are increasingly displacing assembly programming. This fact can be exploited in systems that depend on low energy consumption: energy-optimizing compilers can generate assembly code with the same runtime that consumes less energy than the code produced by time-optimizing compilers. Even without the use of a compiler, the techniques presented can show the assembly programmer ways to save energy.
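The core idea of energy-optimizing compilation, choosing among instruction sequences of equal latency the one with the lower measured energy, can be sketched with an invented cost table; the article's compiler uses per-instruction energy values measured on real hardware, which this sketch does not reproduce.

# Energy-aware selection among functionally equivalent instruction sequences.
# Cycle and energy numbers are invented for illustration.
ALTERNATIVES = {
    "mul_by_4": [
        (("LSL r0, r0, #2",), 1, 2.1),                 # (sequence, cycles, energy)
        (("MOV r1, #4", "MUL r0, r0, r1"), 2, 5.3),
    ],
    "clear": [
        (("MOV r0, #0",), 1, 1.8),
        (("EOR r0, r0, r0",), 1, 1.5),
    ],
}

def select(op, max_cycles):
    # Among the sequences meeting the cycle budget, take the lowest-energy one,
    # i.e. same runtime, less energy than a purely time-optimizing choice.
    feasible = [alt for alt in ALTERNATIVES[op] if alt[1] <= max_cycles]
    return min(feasible, key=lambda alt: alt[2])

print(select("clear", max_cycles=1))  # -> (('EOR r0, r0, r0',), 1, 1.5)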
Rainer Leupers.LANCE: A C Compiler Platform for Embedded Processors. Embedded Systems/Embedded Intelligence February 2001[BibTeX][PDF][Abstract]@article { leupers:06:journal, author = {Leupers, Rainer}, title = {LANCE: A C Compiler Platform for Embedded Processors}, journal = {Embedded Systems/Embedded Intelligence}, year = {2001}, month = {feb}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-es.pdf}, confidential = {n}, abstract = {This paper describes LANCE, a software system for development of C compilers for embedded processors. LANCE comprises an ANSI C frontend, a collection of machine-independent code optimization tools, a C++ library for accessing and manipulating the intermediate program representation, as well as a backend interface for assembly code generators. The backend interface is compatible to standard code generator generator tools and therefore allows for compiler development for application-specific embedded processors at a relatively low implementation effort. LANCE is mainly intended to facilitate C compiler design for embedded processors, so as to eliminate the need for time-consuming assembly programming. Embedded processors for which LANCE based C compilers have been successfully built include both RISCs and DSPs. Initially designed for research purposes only, LANCE is now also being used for production-quality compiler development. Due to its clear tool structure, simple intermediate program representation, and machine independence, LANCE is particularly suitable for fast compiler development for new application-specific processors.}, }This paper describes LANCE, a software system for the development of C compilers for embedded processors. LANCE comprises an ANSI C frontend, a collection of machine-independent code optimization tools, a C++ library for accessing and manipulating the intermediate program representation, as well as a backend interface for assembly code generators. The backend interface is compatible with standard code generator generator tools and therefore allows for compiler development for application-specific embedded processors at a relatively low implementation effort. LANCE is mainly intended to facilitate C compiler design for embedded processors, so as to eliminate the need for time-consuming assembly programming. Embedded processors for which LANCE-based C compilers have been successfully built include both RISCs and DSPs. Initially designed for research purposes only, LANCE is now also being used for production-quality compiler development. Due to its clear tool structure, simple intermediate program representation, and machine independence, LANCE is particularly suitable for fast compiler development for new application-specific processors.
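A toy version of the kind of three-address intermediate representation such a platform lowers C into, together with one machine-independent pass (constant folding), might look as follows; this is a generic illustration only, not LANCE's actual C++ IR classes.

from dataclasses import dataclass

@dataclass
class Quad:
    # One three-address instruction: dst = src1 op src2.
    dst: str
    op: str
    src1: object
    src2: object = None

def fold_constants(ir):
    # Machine-independent pass: replace operations on two literal operands
    # with a constant move, as an IR-level optimizer might.
    out = []
    for q in ir:
        if isinstance(q.src1, int) and isinstance(q.src2, int):
            value = {"+": q.src1 + q.src2, "*": q.src1 * q.src2}[q.op]
            out.append(Quad(q.dst, "const", value))
        else:
            out.append(q)
    return out

ir = [Quad("t1", "+", 2, 3), Quad("t2", "*", "t1", 4)]
print(fold_constants(ir))  # t1 becomes "const 5"; t2 still depends on t1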
Peter Marwedel and Ivo Bolsens (ed.).Proceedings of DATE'2000 (Design, Automation and Test in Europe). IEEE Computer Society 2000, Electronically available through ACM (http://www.acm.org/pubs/contents/proceedings/series/date/)[BibTeX][PDF]@article { marw:00:date, author = {Marwedel, Peter and Bolsens (ed.), Ivo}, title = {Proceedings of DATE'2000 (Design, Automation and Test in Europe)}, journal = {IEEE Computer Society}, year = {2000}, note = {Electronically available through ACM (http://www.acm.org/pubs/contents/proceedings/series/date/)}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/}, confidential = {n}, } S. S. Bhattacharyya, Rainer Leupers and Peter Marwedel.Software Synthesis and Code Generation for Signal Processing Systems. IEEE Trans. on Circuits and Systems II --- Analog and Digital Signal Processing 47 9 2000[BibTeX][PDF][Abstract]@article { bhatt:00:tcas, author = {Bhattacharyya, S. S. and Leupers, Rainer and Marwedel, Peter}, title = {Software Synthesis and Code Generation for Signal Processing Systems}, journal = {IEEE Trans. on Circuits and Systems II --- Analog and Digital Signal Processing}, year = {2000}, volume = {47}, number = {9}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2000-ieee-tcas.pdf}, confidential = {n}, abstract = {The role of software is becoming increasingly important in the implementation of DSP applications. As this trend intensifies, and the complexity of applications escalates, we are seeing an increased need for automated tools to aid in the development of DSP software. This paper reviews the state of the art in programming language and compiler technology for DSP software implementation. In particular, we review techniques for high level, block-diagram-based modeling of DSP applications; the translation of block diagram specifications into efficient C programs using global, target-independent optimization techniques; and the compilation of C programs into streamlined machine code for programmable DSP processors, using architecture-specific and retargetable back-end optimizations. We also point out important directions for further investigation.}, }The role of software is becoming increasingly important in the implementation of DSP applications. As this trend intensifies, and the complexity of applications escalates, we are seeing an increased need for automated tools to aid in the development of DSP software. This paper reviews the state of the art in programming language and compiler technology for DSP software implementation. In particular, we review techniques for high-level, block-diagram-based modeling of DSP applications; the translation of block diagram specifications into efficient C programs using global, target-independent optimization techniques; and the compilation of C programs into streamlined machine code for programmable DSP processors, using architecture-specific and retargetable back-end optimizations. We also point out important directions for further investigation. Peter Marwedel.Special issue on SCOPES'99. TODAES 5 4 October 2000[BibTeX]@article { marw:00:todaes, author = {Marwedel, Peter}, title = {Special issue on SCOPES'99}, journal = {TODAES}, year = {2000}, volume = {5}, number = {4}, month = {oct}, confidential = {n}, } Peter Marwedel (ed.).Special Issue on Design Automation for Embedded Systems. ACM TODAES 5 4 2000[BibTeX][PDF][Abstract]@article { marwedel:00:journal, author = {Marwedel (ed.), Peter}, title = {Special Issue on Design Automation for Embedded Systems}, journal = {ACM TODAES}, year = {2000}, volume = {5}, number = {4}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/}, confidential = {n}, abstract = {(Draft version of the editorial; the layout of the final version is different): Editorial: As the trend towards comprehensive use of electronic information processing continues, more and more so-called embedded systems are being designed and used. Examples of such systems include mobile telecommunication devices, so-called information appliances, automotive electronic equipment and information technology equipment in smart homes. These systems have a number of characteristics in common. Embedded systems typically meet the majority of the following criteria: \begin{itemize} \item they have to meet hard deadlines for their response time, \item they do not come with a keyboard, a large screen and a mouse; \item they hide information processing from the user, \item they have to be cost-, area-, weight- and/or power-efficient, \item they have to be dependable. \end{itemize} In order to provide the required flexibility, more and more embedded systems are software-based. The generation of embedded software requires new software generation techniques that take the special characteristics of embedded systems into account. One of these characteristics is the use of processors optimized for certain application domains or even for certain applications. The main motivation for specialized processors is the need to provide efficient solutions. As an example, processors for digital signal processing (DSP) frequently provide \begin{itemize} \item specialized multiply/accumulate instructions, \item saturating arithmetic, \item heterogenous register sets, \item specialized addressing modes, \item limited parallelism (more recently also very long instruction word (VLIW)-type of parallelism). \end{itemize} If these features are not exploited in compilers, inefficient code is the result and designers have to use assembly languages. In order to get around this uncomfortable situation, the design of specialized compilation techniques for embedded processors has started. General problems of embedded software also include the need for good specification languages, for fast simulation, for verification, for interprocess communication and for interfaces to real-time operating systems, to name just a few.
Stimulated by the CHIPS project on compilation for embedded processors, a first European workshop on compilers for embedded processors was held at Schloss Dagstuhl, Germany, in 1994. Due to its success, it was followed by a series of similar workshops. The title of the first three workshops was Workshop on code generation for embedded processors. From the fourth workshop onwards, the scope also included software generation for embedded processors in general. Hence, the acronym SCOPES (software and code generation for embedded processors) was used from the fourth workshop onwards. The dates and locations of the workshop are as follows: Workshop ; Location ; Dates \\ 1 ; Schloss Dagstuhl, Wadern, Germany ; Aug. 31st to Sept. 2nd, 1994 \\ 2 ; Leuven, Belgium ; March 18th to March 20th, 1996 \\ 3 ; Witten, Germany ; March 4th to March 6th, 1998 \\ 4 ; St. Goar, Germany ; Sept. 1st to Sept. 3rd, 1999 \\ 5 ; St. Goar, Germany ; March 20th to 22nd, 2001 The fourth workshop was held at a very scenic castle hotel called Schloss Rheinfels at St. Goar, Germany. The view from the conference room onto the river Rhine and its valleys was so nice that it was decided to hold the fifth workshop at the same place again. One of the main goals of the workshop is to stimulate the discussion between participants. Therefore, attendance was so far restricted to groups of people who had already worked in the area. This goal has been achieved very nicely so far. Another characteristic is that the workshop does not try to compete with well-established publication channels, such as conferences and journals. Rather, the best contributions at the workshop are considered being candidates for a special publication. As a result of the first workshop, the book Code generation for embedded processors'' (edited by Gert Goossens and myself) was published by Kluwer. Unfortunately, time constraints prevented publishing papers from the second workshop. The best papers from the third workshop were published in the April 1999 issue of Design Automation for Embedded Systems. The current special issue includes the best papers from the fourth workshop. They have been reviewed by an excellent panel of international reviewers consisting of A. Nicolau (Irvine), S. Malik (Princeton), J. van Meerbergen (Eindhoven), B. Wess (Vienna) and R. Wilhelm (Saarbr\"ucken). The first paper is entitled Code minimization and retargetable assembly for custom EPIC and VLIW instruction formats''. It was written by Shail Aditya, Scott Mahlke and B. Ramakrishna Rau of Hewlett Packard. They describe the PICO system, a system for automatically designing and programming very long instruction word (VLIW) architectures. They focus on techniques for generating compact code for their architectures. Some of these techniques are applied during architecture design while others are applied during program generation. With the current trend towards VLIW architectures, this paper is of potential interest to a major number of designers working on these architectures. The title of the second paper is Constraint analysis for code generation: basic techniques and applications in FACTS''. The authors are: Koen van Eijk, Bart Mesman, Carlos A. Alba Pinto , Qin Zhao, Marco Bekooij, Jef van Meerbergen and Jochen Jess. The work is a result of the cooperation between Philips Research Laboratories and Eindhoven University, Netherlands. The work is concerned with code generation for embedded processors. 
Most of these processors are capable of performing a number of operations in parallel. Usually, a separate component of a compiler, called scheduler, is responsible of deciding the order in which operations of a given source program will be executed on the processor. Due to complexity reasons, this scheduler has only a very limited interaction with other compiler components. As a result, schedulers sometimes take decisions which turn out to be poor as soon as more information (e.g. information about registers) is considered. The key advantage of FACTS is that it considers information about hardware resources (such as registers) already during scheduling. Due to this coupling, better code quality can be achieved. The proposed technique can be a valuable enhancement to many scheduling algorithms. The third paper is called Graph based code selection techniques for embedded processors'' by Rainer Leupers and Steven Bashford. Both are with the University of Dortmund, Germany. One of the essential functions of any compiler is to select machine instructions for implementing operations specified in the source program. In general, graphs would be required to represent the flow of values between these operations. For common subexpressions in the source program, these graphs describe the flow of the value represented by the the subexpression to all operations needing that value. Unfortunately, it has been found that the optimal selection of machine instructions implementing graphs is computationally very expensive (NP-hard). It is therefore usually avoided by splitting graphs into trees and selecting machine instructions for trees. This approach is computationally efficient but leads to suboptimal code. Leupers and Bashford describe techniques for graph-based machine instruction selection that lead to better code than tree-based techniques but which are still computationally efficient. These techniques are also capable of exploiting subword parallelism in the form of multimedia instructions. One very important aspect of embedded software design is that of fast simulation of such software. This is required especially during the prototyping stage. During prototyping, it is very common that large amounts of data have to be simulated. Therefore, extremely fast simulation is needed. One approach for obtaining a large simulation speed is described by the authors of the fourth paper. The paper is entitled Retargetable Compiled Simulation of Embedded Processors Using a Machine Description Language''. Its authors are Stefan Pees, Andreas Hoffmann and Heinrich Meyr of the Technical University (RWTH) at Aachen, Germany. The key towards high simulation speeds is compiled simulation. Compiled simulation has already been used for hardware description languages and is now used for simulating digital signal processing (DSP) applications fast, using the retargetable simulator LISA. LISA is between 37x and 170x faster than a commercial simulator. The proposed techniques are expected to be very important for simulating embedded software in general. I hope that this special issue will be one of your key references for work on embedded software generation and simulation. Please enjoy reading the special issue! Dortmund, May 2000 Peter Marwedel \\ Guest Editor}, }(Draft version of the editorial; the layout of the final version is different): Editorial: As the trend towards comprehensive use of electronic information processing continues, more and more so-called embedded systems are being designed and used. 
Examples of such systems include mobile telecommunication devices, so-called information appliances, automotive electronic equipment and information technology equipment in smart homes. These systems have a number of characteristics in common. Embedded systems typically meet the majority of the following criteria: they have to meet hard deadlines for their response time; they do not come with a keyboard, a large screen and a mouse; they hide information processing from the user; they have to be cost-, area-, weight- and/or power-efficient; and they have to be dependable. In order to provide the required flexibility, more and more embedded systems are software-based. The generation of embedded software requires new software generation techniques that take the special characteristics of embedded systems into account. One of these characteristics is the use of processors optimized for certain application domains or even for certain applications. The main motivation for specialized processors is the need to provide efficient solutions. As an example, processors for digital signal processing (DSP) frequently provide specialized multiply/accumulate instructions, saturating arithmetic, heterogeneous register sets, specialized addressing modes, and limited parallelism (more recently also very long instruction word (VLIW)-type parallelism). If these features are not exploited in compilers, inefficient code is the result and designers have to use assembly languages. In order to get around this uncomfortable situation, the design of specialized compilation techniques for embedded processors has started. General problems of embedded software also include the need for good specification languages, for fast simulation, for verification, for interprocess communication and for interfaces to real-time operating systems, to name just a few. Stimulated by the CHIPS project on compilation for embedded processors, a first European workshop on compilers for embedded processors was held at Schloss Dagstuhl, Germany, in 1994. Due to its success, it was followed by a series of similar workshops. The title of the first three workshops was "Workshop on code generation for embedded processors". From the fourth workshop onwards, the scope also included software generation for embedded processors in general. Hence, the acronym SCOPES (software and code generation for embedded processors) was used from the fourth workshop onwards. The dates and locations of the workshops are as follows: 1. Schloss Dagstuhl, Wadern, Germany, Aug. 31st to Sept. 2nd, 1994; 2. Leuven, Belgium, March 18th to March 20th, 1996; 3. Witten, Germany, March 4th to March 6th, 1998; 4. St. Goar, Germany, Sept. 1st to Sept. 3rd, 1999; 5. St. Goar, Germany, March 20th to 22nd, 2001. The fourth workshop was held at a very scenic castle hotel called Schloss Rheinfels at St. Goar, Germany. The view from the conference room onto the river Rhine and its valleys was so nice that it was decided to hold the fifth workshop at the same place again. One of the main goals of the workshop is to stimulate the discussion between participants. Therefore, attendance was so far restricted to groups of people who had already worked in the area. This goal has been achieved very nicely so far. Another characteristic is that the workshop does not try to compete with well-established publication channels, such as conferences and journals.
Rather, the best contributions at the workshop are considered candidates for a special publication. As a result of the first workshop, the book "Code generation for embedded processors" (edited by Gert Goossens and myself) was published by Kluwer. Unfortunately, time constraints prevented publishing papers from the second workshop. The best papers from the third workshop were published in the April 1999 issue of Design Automation for Embedded Systems. The current special issue includes the best papers from the fourth workshop. They have been reviewed by an excellent panel of international reviewers consisting of A. Nicolau (Irvine), S. Malik (Princeton), J. van Meerbergen (Eindhoven), B. Wess (Vienna) and R. Wilhelm (Saarbrücken). The first paper is entitled "Code minimization and retargetable assembly for custom EPIC and VLIW instruction formats". It was written by Shail Aditya, Scott Mahlke and B. Ramakrishna Rau of Hewlett Packard. They describe the PICO system, a system for automatically designing and programming very long instruction word (VLIW) architectures. They focus on techniques for generating compact code for their architectures. Some of these techniques are applied during architecture design while others are applied during program generation. With the current trend towards VLIW architectures, this paper is of potential interest to a large number of designers working on these architectures. The title of the second paper is "Constraint analysis for code generation: basic techniques and applications in FACTS". The authors are Koen van Eijk, Bart Mesman, Carlos A. Alba Pinto, Qin Zhao, Marco Bekooij, Jef van Meerbergen and Jochen Jess. The work is a result of the cooperation between Philips Research Laboratories and Eindhoven University, Netherlands. The work is concerned with code generation for embedded processors. Most of these processors are capable of performing a number of operations in parallel. Usually, a separate component of a compiler, called the scheduler, is responsible for deciding the order in which operations of a given source program will be executed on the processor. For complexity reasons, this scheduler has only very limited interaction with other compiler components. As a result, schedulers sometimes make decisions which turn out to be poor as soon as more information (e.g. information about registers) is considered. The key advantage of FACTS is that it considers information about hardware resources (such as registers) during scheduling. Due to this coupling, better code quality can be achieved. The proposed technique can be a valuable enhancement to many scheduling algorithms. The third paper is called "Graph based code selection techniques for embedded processors" by Rainer Leupers and Steven Bashford. Both are with the University of Dortmund, Germany. One of the essential functions of any compiler is to select machine instructions for implementing operations specified in the source program. In general, graphs would be required to represent the flow of values between these operations. For common subexpressions in the source program, these graphs describe the flow of the value represented by the subexpression to all operations needing that value. Unfortunately, it has been found that the optimal selection of machine instructions implementing graphs is computationally very expensive (NP-hard). It is therefore usually avoided by splitting graphs into trees and selecting machine instructions for trees.
This approach is computationally efficient but leads to suboptimal code. Leupers and Bashford describe techniques for graph-based machine instruction selection that lead to better code than tree-based techniques but which are still computationally efficient. These techniques are also capable of exploiting subword parallelism in the form of multimedia instructions. One very important aspect of embedded software design is that of fast simulation of such software. This is required especially during the prototyping stage. During prototyping, it is very common that large amounts of data have to be simulated. Therefore, extremely fast simulation is needed. One approach for obtaining a high simulation speed is described by the authors of the fourth paper. The paper is entitled "Retargetable Compiled Simulation of Embedded Processors Using a Machine Description Language". Its authors are Stefan Pees, Andreas Hoffmann and Heinrich Meyr of the Technical University (RWTH) at Aachen, Germany. The key to high simulation speeds is compiled simulation. Compiled simulation has already been used for hardware description languages and is now used for simulating digital signal processing (DSP) applications fast, using the retargetable simulator LISA. LISA is between 37x and 170x faster than a commercial simulator. The proposed techniques are expected to be very important for simulating embedded software in general. I hope that this special issue will be one of your key references for work on embedded software generation and simulation. Please enjoy reading the special issue! Dortmund, May 2000. Peter Marwedel, Guest Editor. Rainer Leupers.C-Compiler für Embedded Systems. Elektronik 19/2000 2000[BibTeX][PDF][Abstract]@article { leupers:00:elektronik, author = {Leupers, Rainer}, title = {C-Compiler f\"ur Embedded Systems}, journal = {Elektronik 19/2000}, year = {2000}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2000-elektronik.pdf}, confidential = {n}, abstract = {F\"ur Embedded Systems setzen sich C-Compiler zur Softwareentwicklung gegen\"uber der traditionellen Assemblerprogrammierung immer mehr durch. Allerdings werden oft Spezialprozessoren eingesetzt, f\"ur die keine C-Compiler verf\"ugbar sind. Da der Entwicklungsaufwand f\"ur neue Compiler hoch ist, sollten existierende Tools ausgenutzt werden. Dieser Artikel nennt wichtige Anforderungen an C-Compiler und gibt eine \"Ubersicht verf\"ugbarer Techniken und Tools.}, }For embedded systems, C compilers are increasingly replacing traditional assembly programming for software development. However, special-purpose processors for which no C compilers are available are often employed. Since the development effort for new compilers is high, existing tools should be exploited. This article names important requirements for C compilers and gives an overview of available techniques and tools. Peter Marwedel.Special issue on SCOPES. Design Automation for Embedded Systems 1999[BibTeX]@article { marw:99:scopes, author = {Marwedel, Peter}, title = {Special issue on SCOPES}, journal = {Design Automation for Embedded Systems}, year = {1999}, confidential = {n}, } Stefan Steinke.Programmierung von KFZ-Steuergeraeten.
Elektronik Industrie 1999[BibTeX][PDF][Abstract]@article { steinke:1999:ei, author = {Steinke, Stefan}, title = {Programmierung von KFZ-Steuergeraeten}, journal = {Elektronik Industrie}, year = {1999}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/}, confidential = {n}, abstract = {Der steigende Anteil von Software im Kfz verlagert Entwicklungsaufwendungen und ver\"andert die Entwicklungsprozesse. Den k\"urzer werdenden Entwicklungszyklen und den wachsenden Softwareanforderungen kann nur durch eine Optimierung des Entwicklungsprozesses begegnet werden. Es werden der heutige Stand der industriellen Praxis aufgezeigt und M\"oglichkeiten beschrieben, den Entwicklungsproze\"s zu verbessern, und ein Ausblick auf zuk\"unftige Trends gegeben.}, }The growing share of software in automobiles shifts development effort and changes the development processes. Shorter development cycles and growing software requirements can only be met by optimizing the development process. The article presents the current state of industrial practice, describes ways to improve the development process, and gives an outlook on future trends. Steven Bashford and Rainer Leupers.Phase-Coupled Mapping of Data Flow Graphs to Irregular Data Paths. Design Automation for Embedded Systems 4 2/3 1999[BibTeX][PDF][Abstract]@article { bashford:1999:daes, author = {Bashford, Steven and Leupers, Rainer}, title = {Phase-Coupled Mapping of Data Flow Graphs to Irregular Data Paths}, journal = {Design Automation for Embedded Systems}, year = {1999}, volume = {4}, number = {2/3}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-daes.pdf}, confidential = {n}, abstract = {Many software compilers for embedded processors produce machine code of insufficient quality. Since for most applications software must meet tight code speed and size constraints, embedded software is still largely developed in assembly language. In order to eliminate this bottleneck and to enable the use of high-level language compilers also for embedded software, new code generation and optimization techniques are required. This paper describes a novel code generation technique for embedded processors with irregular data path architectures, such as typically found in fixed-point DSPs.The proposed code generation technique maps data flow graph representation of a program into highly efficient machine code for a target processor modeled by instruction set behavior. High code quality is ensured by tight coupling of different code generation phases. In contrast to earlier works, mainly based on heuristics, our approach is constraint-based. An initial set of constraints on code generation are prescribed by the given processor model. Further constraints arise during code generation based on decisions concerning code selection, register allocation, and scheduling. Whenever possible, decisions are postponed until sufficient information about a good decision has been collected. The constraints are active in the "background" and guarantee local satisfiability at any point of time during code generation. This mechanism permits to simultaneously cope with special-purpose registers and instruction level parallelism. We describe the detailed integration of code generation phases. The implementation is based on the constraint logic programming (CLP) language ECLiPSe.
For a standard DSP, we show that the quality of generated code comes close to hand-written assembly code. Since the input processor model can be edited by the user, also retargetability of the code generation technique is achieved within a certain processor class.}, }Many software compilers for embedded processors produce machine code of insufficient quality. Since for most applications software must meet tight code speed and size constraints, embedded software is still largely developed in assembly language. In order to eliminate this bottleneck and to enable the use of high-level language compilers also for embedded software, new code generation and optimization techniques are required. This paper describes a novel code generation technique for embedded processors with irregular data path architectures, such as typically found in fixed-point DSPs. The proposed code generation technique maps a data flow graph representation of a program into highly efficient machine code for a target processor modeled by instruction set behavior. High code quality is ensured by tight coupling of the different code generation phases. In contrast to earlier works, mainly based on heuristics, our approach is constraint-based. An initial set of constraints on code generation is prescribed by the given processor model. Further constraints arise during code generation based on decisions concerning code selection, register allocation, and scheduling. Whenever possible, decisions are postponed until sufficient information about a good decision has been collected. The constraints are active in the "background" and guarantee local satisfiability at any point of time during code generation. This mechanism makes it possible to cope simultaneously with special-purpose registers and instruction-level parallelism. We describe the detailed integration of the code generation phases. The implementation is based on the constraint logic programming (CLP) language ECLiPSe. For a standard DSP, we show that the quality of the generated code comes close to hand-written assembly code. Since the input processor model can be edited by the user, retargetability of the code generation technique is also achieved within a certain processor class.
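The deferred-decision mechanism described above can be pictured as domain pruning: each operation keeps a set of alternative implementations, and constraints remove alternatives without committing early. Below is a toy Python model of that idea; the actual system is built on the CLP language ECLiPSe, which is not shown here, and all unit and register names are invented.

# Each operation keeps a domain of alternative implementations
# (functional unit, destination register); constraints prune the domains,
# and a decision is committed only once a domain becomes a singleton.
domains = {
    "mul": {("MAC", "acc"), ("ALU", "r0")},
    "add": {("ALU", "r0"), ("ALU", "r1")},
}

def constrain(op, keep):
    # Prune an operation's domain; an empty domain would mean the partial
    # code is unsatisfiable and would trigger backtracking in a CLP system.
    domains[op] = {alt for alt in domains[op] if keep(alt)}
    assert domains[op], f"no implementation left for {op}: backtrack"

# The result of 'mul' feeds an accumulate step, so it must target the accumulator.
constrain("mul", lambda alt: alt[1] == "acc")
# 'add' is scheduled in parallel with 'mul', so it cannot also occupy the MAC.
constrain("add", lambda alt: alt[0] != "MAC")

print(domains)  # 'mul' is now committed; 'add' legitimately keeps two options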
Rainer Leupers.Schneller Code statt schnelle Compiler. Elektronik 22 1999[BibTeX][PDF][Abstract]@article { leupers:1999:elektronik, author = {Leupers, Rainer}, title = {Schneller Code statt schnelle Compiler}, journal = {Elektronik}, year = {1999}, volume = {22}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-elektronik.pdf}, confidential = {n}, abstract = {DSPs werden heute noch zeitaufwendig in Assembler programmiert, da die verf\"ugbaren C-Compiler relativ schlechten Code generieren. Die Hauptursache hierf\"ur sind komplexe, anwendungsspezifische Befehlss\"atze, welche die Erzeugung von effizientem Maschinencode f\"ur die Compiler stark erschweren. Dieser Artikel beschreibt Wege und Techniken zur Produktivit\"atssteigerung in der DSP-Softwareentwicklung mittels innovativer Compiler-Optimierungstechniken.}, }Today, DSPs are still programmed in assembly, a time-consuming task, because the available C compilers generate relatively poor code. The main cause is complex, application-specific instruction sets, which make it very hard for compilers to generate efficient machine code. This article describes ways and techniques to increase productivity in DSP software development by means of innovative compiler optimization techniques. Jean Mermet, Peter Marwedel, Franz J. Rammig, Cleland Newton, Dominique Borrione and Claude Lefaou.Three Decades of Hardware Description Languages in Europe. Journal of Electrical Engineering and Information Science 3 6 1998[BibTeX][PDF][Abstract]@article { mermet:1998:eeis, author = {Mermet, Jean and Marwedel, Peter and Rammig, Franz J. and Newton, Cleland and Borrione, Dominique and Lefaou, Claude}, title = {Three Decades of Hardware Description Languages in Europe}, journal = {Journal of Electrical Engineering and Information Science}, year = {1998}, volume = {3}, number = {6}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1998-eeis.zip}, confidential = {n}, abstract = {This paper binds together a collection of short presentations on Hardware Description Languages (HDLs) developed in Europe and provides a view of the history of HDLs during the last three decades. This historical review wants to present the ideas, conceived in these previous languages , which are now implemented in the standard languages. Furthermore, this paper will highlight those early concects which yet need to be implemented in the evolving standards or could provide a way to unify them (like VHDL or Verilog or SDL) within a formally defined multi-language environment. Among a large number of European works over three decades, we have selected a sample from different countries France, Germany, U.K., Italy, which have been implemented and used reliably in various segments of the industry. The selected HDLs, with the date of origination, are: CASSANDRE (1967), MIMOLA (1977), DACAPO (1979), ELLA (1979), ART (1980), and CASCADE (1981). We do not pretend to any exhaustive review, which is not the goal of this presentation, and have consciously left aside several works as valuable as those selected. We have not addressed for example "synchronous languages" very well developed in France, such as ESTEREL, LUSTRE or SIGNAL. Several other works existed in Germany, such as KARL, which was popular in the eighties, and benefits from a large bibliography or REGLAN. We should mention also among those HDLs not presented here CONLAN (a major international standardization effort involving a notable European contribution). We have tried to compare the main features of the chosen languages according to a list of criteria and briefly identify those which are still missing in the recognized worldwide standards.}, }This paper binds together a collection of short presentations on Hardware Description Languages (HDLs) developed in Europe and provides a view of the history of HDLs during the last three decades. This historical review presents the ideas, conceived in these earlier languages, that are now implemented in the standard languages. Furthermore, this paper highlights those early concepts which have yet to be implemented in the evolving standards or could provide a way to unify them (like VHDL, Verilog or SDL) within a formally defined multi-language environment. Among a large number of European works over three decades, we have selected a sample from different countries (France, Germany, the U.K. and Italy) that has been implemented and used reliably in various segments of the industry. The selected HDLs, with their dates of origination, are: CASSANDRE (1967), MIMOLA (1977), DACAPO (1979), ELLA (1979), ART (1980), and CASCADE (1981).
We do not claim to give an exhaustive review, which is not the goal of this presentation, and have consciously left aside several works as valuable as those selected. We have not addressed, for example, the "synchronous languages" well developed in France, such as ESTEREL, LUSTRE or SIGNAL. Several other works existed in Germany, such as KARL, which was popular in the eighties and benefits from a large bibliography, or REGLAN. Among those HDLs not presented here, we should also mention CONLAN (a major international standardization effort involving a notable European contribution). We have tried to compare the main features of the chosen languages according to a list of criteria and briefly identify those which are still missing in the recognized worldwide standards. Rainer Leupers and Peter Marwedel.Retargetable Code Generation based on Structural Processor Descriptions. Design Automation for Embedded Systems 3 1 1998[BibTeX][PDF][Abstract]@article { leupers:1998:daes, author = {Leupers, Rainer and Marwedel, Peter}, title = {Retargetable Code Generation based on Structural Processor Descriptions}, journal = {Design Automation for Embedded Systems}, year = {1998}, volume = {3}, number = {1}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1998-daes.pdf}, confidential = {n}, abstract = {Design automation for embedded systems comprising both hardware and software components demands for code generators integrated into electronic CAD systems. These code generators provide the necessary link between software synthesis tools in HW/SW codesign systems and embedded processors. General-purpose compilers for standard processors are often insufficient, because they do not provide flexibility with respect to different target processors and also suffer from inferior code quality. While recent research on code generation for embedded processors has primarily focussed on code quality issues, in this contribution we emphasize the importance of retargetability, and we describe an approach to achieve retargetability. We propose usage of uniform, external target processor models in code generation, which describe embedded processors by means of RT-level netlists. Such structural models incorporate more hardware details than purely behavioral models, thereby permitting a close link to hardware design tools and fast adaptation to different target processors. The MSSQ compiler, which is part of the MIMOLA hardware design system, operates on structural models. We describe input formats, central data structures, and code generation techniques in MSSQ. The compiler has been successfully retargeted to a number of real-life processors, which proves feasibility of our approach with respect to retargetability. We discuss capabilities and limitations of MSSQ, and identify possible areas of improvement.}, }Design automation for embedded systems comprising both hardware and software components demands code generators integrated into electronic CAD systems. These code generators provide the necessary link between software synthesis tools in HW/SW codesign systems and embedded processors. General-purpose compilers for standard processors are often insufficient, because they do not provide flexibility with respect to different target processors and also suffer from inferior code quality.
While recent research on code generation for embedded processors has primarily focussed on code quality issues, in this contribution we emphasize the importance of retargetability, and we describe an approach to achieve it. We propose the use of uniform, external target processor models in code generation, which describe embedded processors by means of RT-level netlists. Such structural models incorporate more hardware details than purely behavioral models, thereby permitting a close link to hardware design tools and fast adaptation to different target processors. The MSSQ compiler, which is part of the MIMOLA hardware design system, operates on structural models. We describe input formats, central data structures, and code generation techniques in MSSQ. The compiler has been successfully retargeted to a number of real-life processors, which proves the feasibility of our approach with respect to retargetability. We discuss capabilities and limitations of MSSQ, and identify possible areas of improvement. Peter Marwedel and C. Lopez-Barrio.Special issue on ED&TC. IEEE Design & Test June 1997[BibTeX]@article { marw:97:edtc, author = {Marwedel, Peter and Lopez-Barrio, C.}, title = {Special issue on ED\&TC}, journal = {IEEE Design \& Test}, year = {1997}, month = {June}, confidential = {n}, } Ralf Niemann and Peter Marwedel.An Algorithm for Hardware/Software Partitioning Using Mixed Integer Linear Programming. Design Automation for Embedded Systems 1997[BibTeX][PDF][Abstract]@article { niemann:1997:daes, author = {Niemann, Ralf and Marwedel, Peter}, title = {An Algorithm for Hardware/Software Partitioning Using Mixed Integer Linear Programming}, journal = {Design Automation for Embedded Systems}, year = {1997}, keywords = {hwsw}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-daes-journal.pdf}, confidential = {n}, abstract = {One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated for a chosen objective function. The partitioning approach works fully automatic and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating values for the cost metrics is compensated by an improved quality of the values. Therefore, fewer iteration steps for partitioning are needed. The paper presents an algorithm using integer programming for solving the hardware/software partitioning problem leading to promising results.}, }One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated for a chosen objective function. The partitioning approach works fully automatically and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating values for the cost metrics is compensated by an improved quality of the values. Therefore, fewer iteration steps for partitioning are needed. The paper presents an algorithm using integer programming for solving the hardware/software partitioning problem, leading to promising results.
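In the spirit of (but far simpler than) the paper's IP formulation, hardware/software partitioning can be stated as a small MILP. The task data, area costs, and single-deadline timing model below are invented for illustration; the sketch again uses the PuLP solver.

from pulp import LpBinary, LpMinimize, LpProblem, LpVariable, lpSum

# Task data: name -> (software execution time, hardware area cost).
# Invented numbers; the paper's model also covers interfacing, hardware
# sharing and multi-processor targets.
tasks = {
    "fft": (9.0, 120),
    "fir": (6.0, 80),
    "ctrl": (2.0, 60),
}
DEADLINE = 12.0

prob = LpProblem("hw_sw_partitioning", LpMinimize)
hw = {t: LpVariable(f"hw_{t}", cat=LpBinary) for t in tasks}  # 1 = move task to HW

# Objective: minimize the total hardware area.
prob += lpSum(area * hw[t] for t, (time_sw, area) in tasks.items())
# Crude timing model: tasks left in software run sequentially on one CPU
# and must jointly meet the deadline; hardware tasks run concurrently.
prob += lpSum(time_sw * (1 - hw[t]) for t, (time_sw, area) in tasks.items()) <= DEADLINE

prob.solve()
print({t: ("HW" if hw[t].value() == 1 else "SW") for t in tasks})  # fir moves to HW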
Rainer Leupers and Peter Marwedel.Time-Constrained Code Compaction for DSPs. IEEE Trans. on VLSI Systems 5 1 1997[BibTeX][PDF][Abstract]@article { leupers:1997:tvlsi, author = {Leupers, Rainer and Marwedel, Peter}, title = {Time-Constrained Code Compaction for DSPs}, journal = {IEEE Trans. on VLSI Systems}, year = {1997}, volume = {5}, number = {1}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-ieee_tvlsi.pdf}, confidential = {n}, abstract = {This paper addresses instruction-level parallelism in code generation for DSPs. In presence of potential parallelism, the task of code generation includes code compaction, which parallelizes primitive processor operations under given dependency and resource constraints. Furthermore, DSP algorithms in most cases are required to guarantee real-time response. Since the exact execution speed of a DSP program is only known after compaction, real-time constraints should be taken into account during the compaction phase. While previous DSP code generators rely on rigid heuristics for compaction, we propose a novel approach to exact local code compaction based on an Integer Programming model, which handles time constraints. Due to a general problem formulation, the IP model also captures encoding restrictions and handles instructions having alternative encodings and side effects, and therefore applies to a large class of instruction formats. Capabilities and limitations of our approach are discussed for different DSPs.}, }This paper addresses instruction-level parallelism in code generation for DSPs. In the presence of potential parallelism, the task of code generation includes code compaction, which parallelizes primitive processor operations under given dependency and resource constraints. Furthermore, DSP algorithms in most cases are required to guarantee real-time response. Since the exact execution speed of a DSP program is only known after compaction, real-time constraints should be taken into account during the compaction phase. While previous DSP code generators rely on rigid heuristics for compaction, we propose a novel approach to exact local code compaction based on an Integer Programming model, which handles time constraints. Due to a general problem formulation, the IP model also captures encoding restrictions and handles instructions having alternative encodings and side effects, and therefore applies to a large class of instruction formats. Capabilities and limitations of our approach are discussed for different DSPs.
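Code compaction packs primitive operations into parallel instruction words under dependency and resource constraints. A greedy list-compaction sketch with invented operations conveys the problem; the paper instead solves it exactly with an Integer Programming model that also handles encoding restrictions and time constraints.

# Greedy list compaction: pack primitive operations into parallel instruction
# words, respecting data dependences and functional-unit conflicts.
ops = {  # op -> (functional unit required, set of predecessor ops)
    "ld1": ("LSU", set()),
    "ld2": ("LSU", set()),
    "sub": ("ALU", set()),
    "mul": ("MAC", {"ld1", "ld2"}),
    "add": ("ALU", {"mul"}),
}

def compact(ops):
    done, words = set(), []
    while len(done) < len(ops):
        used_units, word = set(), []
        for op, (unit, preds) in ops.items():
            # Ready when all predecessors are done and the unit is still free.
            if op not in done and preds <= done and unit not in used_units:
                word.append(op)
                used_units.add(unit)
        done |= set(word)
        words.append(word)
    return words

print(compact(ops))  # [['ld1', 'sub'], ['ld2'], ['mul'], ['add']]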
For that purpose the MIMOLA (machine independent microprogramming language) system MSS (MIMOLA hardware design system) is presented. It is shown that logic programming techniques have several advantages, especially in the area of integrated circuit design. One of the main advantages is the small code size which translates to easy maintenance. We make extensive use of two main features of standard Prolog and constraint logic programming, i.e., the backtracking and coroutining mechanisms, to express Boolean constraints.}, }We show how an extended Prolog can be exploited to implement different electronic CAD tools. Starting with a computer hardware description language (CHDL), several problems like digital circuit analysis, simulation, test generation and code generation for programmable microprocessors are discussed. For that purpose the MIMOLA (machine independent microprogramming language) system MSS (MIMOLA hardware design system) is presented. It is shown that logic programming techniques have several advantages, especially in the area of integrated circuit design. One of the main advantages is the small code size which translates to easy maintenance. We make extensive use of two main features of standard Prolog and constraint logic programming, i.e., the backtracking and coroutining mechanisms, to express Boolean constraints. Wolfgang Schenk.Retargetable Code Generation for Parallel, Pipelined Processor Structures. In: P. Marwedel, G. Goossens: Code Generation for Embedded Processors 1995[BibTeX][PDF][Abstract]@article { schenk:1995:kap, author = {Schenk, Wolfgang}, title = {Retargetable Code Generation for Parallel, Pipelined Processor Structures}, journal = {In: P. Marwedel, G. Goossens: Code Generation for Embedded Processors}, year = {1995}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-kap.pdf}, confidential = {n}, abstract = {The demand for decreased turnaround time in the design of programmable digital circuits requires CAD tools for synthesis, verification and code generation. Usually an RT-level netlist is available as soon as the datapath is designed. Given the netlist and the behavior of the RT level modules, the proposed compiler maps a source program to the binary code of the target machine. The main tasks of the compiler are allocation, register allocation, scheduling and compaction. These tasks are highly interdependent. Some machine features such as operator chaining, multi-cycle operations, pipeline latency, load delay, delayed branch, or residual control give rise to instruction dependencies, which can be automatically extracted from the structural description. From the netlist the proposed compiler derives an internal target machine representation that is general enough to support all target architecture features mentioned above. In case the hardware supports different operators for a given operation the code generator must not commit to one of them until a suitable alternative can be determined. In order to generate high quality code and to support irregular architectures, the code generator examines the alternative code versions.}, }The demand for decreased turnaround time in the design of programmable digital circuits requires CAD tools for synthesis, verification and code generation. Usually an RT-level netlist is available as soon as the datapath is designed. Given the netlist and the behavior of the RT level modules, the proposed compiler maps a source program to the binary code of the target machine.
The main tasks of the compiler are allocation, register allocation, scheduling and compaction. These tasks are highly interdependent. Some machine features such as operator chaining, multi-cycle operations, pipeline latency, load delay, delayed branch, or residual control give rise to instruction dependencies, which can be automatically extracted from the structural description. From the netlist the proposed compiler derives an internal target machine representation that is general enough to support all target architecture features mentioned above. In case the hardware supports different operators for a given operation the code generator must not commit to one of them until a suitable alternative can be determined. In order to generate high quality code and to support irregular architectures, the code generator examines the alternative code versions. Ulrich Bieker.Retargetable Compilation of Self-Test Programs Using Constraint Logic Programming. In: Code Generation for Embedded Processors (Edited by P. Marwedel and G. Goossens) 1995[BibTeX][PDF][Abstract]@article { bieker:1995:restart, author = {Bieker, Ulrich}, title = {Retargetable Compilation of Self-Test Programs Using Constraint Logic Programming}, journal = {In: Code Generation for Embedded Processors (Edited by P. Marwedel and G. Goossens)}, year = {1995}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-restart.pdf}, confidential = {n}, abstract = {This chapter presents a retargetable code generator specialized in the compilation of self-test programs and exploiting new techniques from Constraint Logic Programming (CLP). Firstly, we show how CLP can be exploited to improve the software production process, especially for retargetable code generation and test generation. CLP combines the declarative paradigm of logic programming with the efficiency of constraint solving techniques. CLP systems come with built-in mechanisms for solving constraints over various domains. For example, satisfiability checkers support Boolean constraints and IP-solvers support integer domains. Furthermore, CLP makes it easier to solve problems concurrently, e.g. the phase coupling problem during code generation. Secondly, we present a solution for testing embedded processors. Thus we exploit CLP techniques for retargetable code generation to generate self-test programs, given a set of test patterns for each of the register transfer processor components.}, }This chapter presents a retargetable code generator specialized in the compilation of self-test programs and exploiting new techniques from Constraint Logic Programming (CLP). Firstly, we show how CLP can be exploited to improve the software production process, especially for retargetable code generation and test generation. CLP combines the declarative paradigm of logic programming with the efficiency of constraint solving techniques. CLP systems come with built-in mechanisms for solving constraints over various domains. For example, satisfiability checkers support Boolean constraints and IP-solvers support integer domains. Furthermore, CLP makes it easier to solve problems concurrently, e.g. the phase coupling problem during code generation. Secondly, we present a solution for testing embedded processors. Thus we exploit CLP techniques for retargetable code generation to generate self-test programs, given a set of test patterns for each of the register transfer processor components.
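As a side note to the integer-programming entries above (Niemann/Marwedel on hardware/software partitioning, Leupers/Marwedel on IP-based code compaction), the following is a minimal, purely illustrative sketch of a 0/1 integer program for partitioning. All task names, cost figures, the deadline, and the use of the PuLP solver library are assumptions made for this example; the actual models in the papers are considerably richer (multi-processor targets, interfacing, hardware sharing, encoding constraints).

```python
# Toy 0/1 ILP for hardware/software partitioning -- illustrative only,
# not the model from the cited paper. All numbers are invented.
from pulp import LpProblem, LpMinimize, LpVariable, lpSum, LpBinary, PULP_CBC_CMD

tasks = ["fir", "fft", "ctrl", "io"]
hw_area = {"fir": 4, "fft": 9, "ctrl": 2, "io": 1}   # hypothetical area cost in HW
sw_time = {"fir": 8, "fft": 15, "ctrl": 3, "io": 2}  # hypothetical SW execution time
hw_time = {"fir": 1, "fft": 2, "ctrl": 1, "io": 1}   # hypothetical HW execution time
DEADLINE = 12

prob = LpProblem("hw_sw_partitioning", LpMinimize)
h = {t: LpVariable(f"h_{t}", cat=LpBinary) for t in tasks}  # h[t] = 1: task t in HW

# Objective: minimize total hardware area.
prob += lpSum(hw_area[t] * h[t] for t in tasks)

# Timing constraint (simplistic sequential model): the total execution time
# of the chosen implementations must meet the deadline.
prob += lpSum(hw_time[t] * h[t] + sw_time[t] * (1 - h[t]) for t in tasks) <= DEADLINE

prob.solve(PULP_CBC_CMD(msg=0))
for t in tasks:
    print(t, "-> HW" if h[t].value() > 0.5 else "-> SW")
```

With these invented numbers the solver moves fft, ctrl and io into hardware (total area 12), the cheapest assignment that still meets the deadline; this is exactly the "optimal results for a chosen objective function" property of IP that the abstract emphasizes.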
Birger Landwehr and Peter Marwedel.Exploitation of component information in a RAM-based architectural synthesis system. in: G. Saucier (ed.): Logic and Architectural Synthesis 1995[BibTeX][PDF][Abstract]@article { marwedel:1995:todos, author = {Landwehr, Birger and Marwedel, Peter}, title = {Exploitation of component information in a RAM-based architectural synthesis system}, journal = {in: G. Saucier (ed.): Logic and Architectural Synthesis}, year = {1995}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-todos_ram.pdf}, confidential = {n}, abstract = {This paper describes how the TODOS microarchitecture synthesis system uses information about available library components during the synthesis phases. TODOS stands for "TOp Down Synthesis". TODOS is an extension of the work described by Marwedel in 1986. TODOS takes advantage of this information right from the beginning and contains an assignment algorithm considering more component-specific library details than other algorithms. Special care is taken with RAMs as library elements. Possible multiple concurrent accesses are considered in the scheduling and the assignment phases. Possibilities for scheduling reads and writes with common addresses in the same control step are exploited. The assignment algorithm simultaneously generates bindings to ALUs, immediate control fields and memory ports. The paper shows that some control steps do not influence the generated data path. Excluding these control steps from the assignment phase speeds up this phase. An even more important speedup is obtained by using special simplifying rules for the assignment problem at hand.}, }This paper describes how the TODOS microarchitecture synthesis system uses information about available library components during the synthesis phases. TODOS stands for "TOp Down Synthesis". TODOS is an extension of the work described by Marwedel in 1986. TODOS takes advantage of this information right from the beginning and contains an assignment algorithm considering more component-specific library details than other algorithms. Special care is taken with RAMs as library elements. Possible multiple concurrent accesses are considered in the scheduling and the assignment phases. Possibilities for scheduling reads and writes with common addresses in the same control step are exploited. The assignment algorithm simultaneously generates bindings to ALUs, immediate control fields and memory ports. The paper shows that some control steps do not influence the generated data path. Excluding these control steps from the assignment phase speeds up this phase. An even more important speedup is obtained by using special simplifying rules for the assignment problem at hand. Peter Marwedel.Code Generation for Embedded Processors: An Introduction. In: P. Marwedel, G. Goossens (eds.): Code Generation for Embedded Processors 1995[BibTeX][PDF][Abstract]@article { marwedel:1995:codegen, author = {Marwedel, Peter}, title = {Code Generation for Embedded Processors: An Introduction}, journal = {In: P. Marwedel, G. Goossens (eds.): Code Generation for Embedded Processors}, year = {1995}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-codegen-intro.pdf}, confidential = {n}, abstract = {In this contribution, we examine requirements and application scenarios for compilers for embedded systems and present some first approaches for the development of such tools, with emphasis on retargetability.
Approaches can be classified according to the target model they use. Structural, behavioural and mixed models have been tried. We also mention recent work on instruction extraction which bridges the gap between these models. Furthermore, we propose a model for processor classification.}, }In this contribution, we examine requirements and application scenarios for compilers for embedded systems and present some first approaches for the development of such tools, with emphasis on retargetability. Approaches can be classified according to the target model they use. Structural, behavioural and mixed models have been tried. We also mention recent work on instruction extraction which bridges the gap between these models. Furthermore, we propose a model for processor classification. Jürgen Herrmann and Renate Beckmann.LEFT - A System that Learns Rules about VLSI-Design from Structural Descriptions. Applied Artificial Intelligence, An International Journal 8 1 1994[BibTeX][Abstract]@article { hermmann:1994, author = {Herrmann, J\"urgen and Beckmann, Renate}, title = {LEFT - A System that Learns Rules about VLSI-Design from Structural Descriptions}, journal = {Applied Artificial Intelligence, An International Journal}, year = {1994}, volume = {8}, number = {1}, confidential = {n}, abstract = {The system presented, LEFT, learns most specific generalizations (MSGs) from structural descriptions. The new inductive multi-staged generalization algorithm is based on several new or enhanced ideas that improve the quality of generalization and make it applicable to real-world problems: LEFT evaluates the quality of each generated MSG using weighted predicates. The algorithm distinguishes between important and less-important predicates. Built-in predicates are used to select alternative MSGs and improve the resulting hypothesis. The system has been applied successfully to chip-floorplanning - a subtask of VLSI-design. It acquires rules describing single floorplanning steps.}, }The system presented, LEFT, learns most specific generalizations (MSGs) from structural descriptions. The new inductive multi-staged generalization algorithm is based on several new or enhanced ideas that improve the quality of generalization and make it applicable to real-world problems: LEFT evaluates the quality of each generated MSG using weighted predicates. The algorithm distinguishes between important and less-important predicates. Built-in predicates are used to select alternative MSGs and improve the resulting hypothesis. The system has been applied successfully to chip-floorplanning - a subtask of VLSI-design. It acquires rules describing single floorplanning steps. Peter Marwedel.Implementations of IF-statements in the TODOS microarchitecture synthesis system. in: G. Saucier and J. Trilhe (Editors): Synthesis for Control Dominated Circuits (A-22) 1993[BibTeX][PDF][Abstract]@article { marwedel:1993:ifip, author = {Marwedel, Peter}, title = {Implementations of IF-statements in the TODOS microarchitecture synthesis system}, journal = {in: G. Saucier and J. Trilhe (Editors): Synthesis for Control Dominated Circuits (A-22)}, year = {1993}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-IFIP.pdf}, confidential = {n}, abstract = {In microarchitecture synthesis, early algorithms considered only a single implementation technique for IF-statements. Focus was on scheduling and on maximum hardware sharing. In this paper, we present available options in more detail.
They are described by using explicit program transformations. Some of these techniques have the potential to consider optimizations beyond the classical basic block boundary while maintaining the simplicity of basic-block oriented approaches.}, }In microarchitecture synthesis, early algorithms considered only a single implementation technique for IF-statements. Focus was on scheduling and on maximum hardware sharing. In this paper, we present available options in more detail. They are described by using explicit program transformations. Some of these techniques have the potential to consider optimizations beyond the classical basic block boundary while maintaining the simplicity of basic-block oriented approaches. Peter Marwedel and Wolfgang Rosenstiel.Synthesis of Register-Transfer-Structures from Behavioral Descriptions (in German: Synthese von Register- Transfer-Strukturen aus Verhaltensbeschreibungen). Informatik-Spektrum 1992[BibTeX][PDF][Abstract]@article { marwedel:1992:is, author = {Marwedel, Peter and Rosenstiel, Wolfgang}, title = {Synthesis of Register-Transfer-Structures from Behavioral Descriptions (in German: Synthese von Register- Transfer-Strukturen aus Verhaltensbeschreibungen)}, journal = {Informatik-Spektrum}, year = {1992}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1992-InformatikSpektrum.pdf}, confidential = {n}, abstract = {The considerable increase in application-specific integrated circuits necessitates an essential reduction of design time as well as design cost. Usually the design automation starts with logic design. The task to deduce a first circuit structure from a behavioural description is up to the designer. Structure synthesis, often also described as "High-Level Synthesis", aims to automatically generate a circuit structure at the register transfer level from a behavioural specification. This contribution explains the principles and methods that form the basis of this structure synthesis.}, }The considerable increase in application-specific integrated circuits necessitates an essential reduction of design time as well as design cost. Usually the design automation starts with logic design. The task to deduce a first circuit structure from a behavioural description is up to the designer. Structure synthesis, often also described as "High-Level Synthesis", aims to automatically generate a circuit structure at the register transfer level from a behavioural specification. This contribution explains the principles and methods that form the basis of this structure synthesis. Jürgen Herrmann and Renate Beckmann.LEFT - A Learning Tool for Early Floorplanning. Microprocessing and Microprogramming 35 1992[BibTeX][Abstract]@article { herrmann:1992:euromicro, author = {Herrmann, J\"urgen and Beckmann, Renate}, title = {LEFT - A Learning Tool for Early Floorplanning}, journal = {Microprocessing and Microprogramming}, year = {1992}, volume = {35}, confidential = {n}, abstract = {In this paper a new, interactive approach to floorplanning is presented. The learning tool LEFT provides an environment for the creation of floorplan topologies. LEFT's input is a list of blocks to be placed on a two-dimensional area and a specification of their connections. The designer uses LEFT as an extended, graphical design editor and selects operators that perform single floorplanning steps. In this way a floorplan is created interactively. LEFT's learning component creates specific production rules from observed design steps and generalizes them by means of a new and optimized inductive machine learning mechanism. Each time a rule matches the current state of the design process, LEFT proposes the execution of the corresponding operator from the right-hand side of the rule. In this way LEFT works as an apprentice system that supports the creation of early floorplans, adapts to the design style of the user and increases its knowledge during the use of the system.}, }In this paper a new, interactive approach to floorplanning is presented. The learning tool LEFT provides an environment for the creation of floorplan topologies. LEFT's input is a list of blocks to be placed on a two-dimensional area and a specification of their connections. The designer uses LEFT as an extended, graphical design editor and selects operators that perform single floorplanning steps. In this way a floorplan is created interactively. LEFT's learning component creates specific production rules from observed design steps and generalizes them by means of a new and optimized inductive machine learning mechanism. Each time a rule matches the current state of the design process, LEFT proposes the execution of the corresponding operator from the right-hand side of the rule. In this way LEFT works as an apprentice system that supports the creation of early floorplans, adapts to the design style of the user and increases its knowledge during the use of the system. Wolfgang Schenk and Peter Marwedel.Improving the Performance of High-Level Synthesis. Microprocessing and Microprogramming 27, pages 381-388 1989[BibTeX][PDF][Abstract]@article { marwedel:1989:schenk, author = {Schenk, Wolfgang and Marwedel, Peter}, title = {Improving the Performance of High-Level Synthesis}, journal = {Microprocessing and Microprogramming}, year = {1989}, volume = {27}, pages = {381-388}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1989-euromicro-marwedel.pdf}, confidential = {n}, abstract = {In this paper we study possible improvements of high-level (architectural) synthesis processes. We allow the designer to indicate a set of bindings between behaviour and structure in order to add some of the designer's knowledge to the design process. These bindings can be used to exclude inefficient designs. The remaining design space may then be studied in more detail, using unified backtracking.
Backtracking, together with preliminary floor-planning, is required for area-efficient designs.}, }In this paper we study possible improvements of high-level (architectural) synthesis processes. We allow the designer to indicate a set of bindings between behaviour and structure in order to add some of the designer's knowledge to the design process. These bindings can be used to exclude inefficient designs. The remaining design space may then be studied in more detail, using unified backtracking. Backtracking, together with preliminary floor-planning, is required for area-efficient designs. Peter Marwedel.On the Use of Hierarchies in the MIMOLA Hardware Design System. CompEuro Conf. on Computers and VLSI, pages 944-948 1987, OCR errors are possible[BibTeX][PDF][Abstract]@article { marwedel:1987:compeuro, author = {Marwedel, Peter}, title = {On the Use of Hierarchies in the MIMOLA Hardware Design System}, journal = {CompEuro Conf. on Computers and VLSI}, year = {1987}, pages = {944-948}, note = {OCR errors are possible}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1987-CompEuro.pdf}, confidential = {n}, abstract = {Until recently, the use of hierarchies in CAD for VLSI has almost exclusively been restricted to layout and simulation problems. The paper describes representation and handling of design hierarchies in the MIMOLA design system, featuring RT-level synthesis, test program generation and retargetable code generation. A method of embedding these tools in a common design environment, providing access to the hierarchy, is described. Advantages of having a design hierarchy are presented for each of the tools. In addition, relevant features of the MIMOLA language are explained.}, }Until recently, the use of hierarchies in CAD for VLSI has almost exclusively been restricted to layout and simulation problems. The paper describes representation and handling of design hierarchies in the MIMOLA design system, featuring RT-level synthesis, test program generation and retargetable code generation. A method of embedding these tools in a common design environment, providing access to the hierarchy, is described. Advantages of having a design hierarchy are presented for each of the tools. In addition, relevant features of the MIMOLA language are explained. Peter Marwedel.An Algorithm for the Synthesis of Processor Structures from Behavioural Specification. Microprocessing and Microprogramming (EUROMICRO Journal) 18, pages 944-948 1986[BibTeX][PDF][Abstract]@article { marwedel:1986:euromicro, author = {Marwedel, Peter}, title = {An Algorithm for the Synthesis of Processor Structures from Behavioural Specification}, journal = {Microprocessing and Microprogramming (EUROMICRO Journal)}, year = {1986}, volume = {18}, pages = {944-948}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1986-Euromicro.pdf}, confidential = {n}, abstract = {This paper describes a method for the automatic generation of the internal structure of digital processors from a specification of the required behaviour. The latter is specified by a high-level, PASCAL-like program. The internal structure is described in terms of memories, arithmetic/logic function boxes, multiplexers and their interconnections. In order to reduce the complexity of the design process, it is partitioned into a sequence of individual steps.
These steps include a flexible expression decomposition, a statement scheduling phase, a new module selection method and optimizations of interconnections and instruction word length.}, }This paper describes a method for the automatic generation of the internal structure of digital processors from a specification of the required behaviour. The latter is specified by a high-level, PASCAL-like program. The internal structure is described in terms of memories, arithmetic/logic function boxes, multiplexers and their interconnections. In order to reduce the complexity of the design process, it is partitioned into a sequence of individual steps. These steps include a flexible expression decomposition, a statement scheduling phase, a new module selection method and optimizations of interconnections and instruction word length. P. Marwedel.The MIMOLA Design System: A Design System Which Spans Several Levels. Methodologies for Computer System Design 1985[BibTeX][PDF][Abstract]@article { marwedel:1985, author = {Marwedel, P.}, title = {The MIMOLA Design System: A Design System Which Spans Several Levels}, journal = {Methodologies for Computer System Design}, year = {1985}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1985-ifip-Design-System.pdf}, confidential = {n}, abstract = {The MIMOLA design system is a computer-aided system for the design of digital computers. The system uses requirements containing typical application programs of the computer to be designed. The output of the design process is a register-transfer level description of the computer. Thus the MIMOLA system covers several design levels. The paper presents the method, the CAD-tools and some applications. CAD tools generate parallel programs from sequential programs, synthesize hardware structures, generate code and evaluate hardware structures. The designer plays an active role in the design process by bringing in his ideas about design improvements.}, }The MIMOLA design system is a computer-aided system for the design of digital computers. The system uses requirements containing typical application programs of the computer to be designed. The output of the design process is a register-transfer level description of the computer. Thus the MIMOLA system covers several design levels. The paper presents the method, the CAD-tools and some applications. CAD tools generate parallel programs from sequential programs, synthesize hardware structures, generate code and evaluate hardware structures. The designer plays an active role in the design process by bringing in his ideas about design improvements. Peter Marwedel.A Retargetable Compiler for a High-Level Microprogramming Language. ACM Sigmicro Newsletter 15 4 1984[BibTeX][PDF][Abstract]@article { marwedel:1984:acm, author = {Marwedel, Peter}, title = {A Retargetable Compiler for a High-Level Microprogramming Language}, journal = {ACM Sigmicro Newsletter}, year = {1984}, volume = {15}, number = {4}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1984-Sigmicro-15.pdf}, confidential = {n}, abstract = {A compiler for the generation of microcode for a high-level microprogramming language is presented. The compiler is target machine independent. The input to the compiler consists of a hardware description, a high-level microprogram and a set of program transformation rules.
The compiler is able to take advantage of optimization techniques which are used by microprogrammers because many of these can be represented by program transformation rules.}, }A compiler for the generation of microcode for a high-level microprogramming language is presented. The compiler is target machine independent. The input to the compiler consists of a hardware description, a high-level microprogram and a set of program transformation rules. The compiler is able to take advantage of optimization techniques which are used by microprogrammers because many of these can be represented by program transformation rules. Gerhard Zimmermann, Richard Cloutier, Richard Rudell, Mark Albert and Beth Hurd.Mimola Software System Primer (Version MSS1). Computer Hardware Description Languages and their Applications, pages 281-292 December 1982[BibTeX][PDF][Abstract]@article { marwedel:81:ifip, author = {Zimmermann, Gerhard and Cloutier, Richard and Rudell, Richard and Albert, Mark and Hurd, Beth}, title = {Mimola Software System Primer (Version MSS1)}, journal = {Computer Hardware Description Languages and their Applications}, year = {1982}, pages = {281-292}, month = {dec}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1982-Mimolaprimer.pdf}, confidential = {n}, abstract = {Mimola Primer}, }Mimola Primer Gerhard Zimmermann.Computer Aided Synthesis of Digital Systems. Computer Hardware Description Languages and their Applications 1981[BibTeX][PDF][Abstract]@article { zimm:1981:ifip, author = {Zimmermann, Gerhard}, title = {COMPUTER AIDED SYNTHESIS OF DIGITAL-SYSTEMS}, journal = {COMPUTER HARDWARE DESCRIPTION LANGUAGES AND THEIR APPLICATIONS}, year = {1981}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1981-IFIP-Zimmermann.pdf}, confidential = {n}, abstract = {The design of a digital system can typically be split into phases, which are defined here as...}, }The design of a digital system can typically be split into phases, which are defined here as... Peter Marwedel.A Retargetable Microcode Generation System for a High-Level Microprogramming Language. ACM Sigmicro Newsletter 12, pages 115-123 1981[BibTeX][PDF][Abstract]@article { marwedel:1981:sigmicro, author = {Marwedel, Peter}, title = {A Retargetable Microcode Generation System for a High-Level Microprogramming Language}, journal = {ACM Sigmicro Newsletter}, year = {1981}, volume = {12}, pages = {115-123}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1981-Sigmicro-12.pdf}, confidential = {n}, abstract = {A system for the generation of microcode from a high-level micro-programming language is presented. The system is independent of the target machine because it is table-driven by a separate hardware declaration. It is applicable for horizontally microprogrammed machines.}, }A system for the generation of microcode from a high-level micro-programming language is presented. The system is independent of the target machine because it is table-driven by a separate hardware declaration. It is applicable for horizontally microprogrammed machines. Peter Marwedel.The Design of a Subprocessor with Dynamic Microprogramming with MIMOLA.
Informatik-Fachberichte 27, pages 164-177 1980[BibTeX][PDF][Abstract]@article { marwedel:1980:gi, author = {Marwedel, Peter}, title = {The Design of a Subprocessor with Dynamic Microprogramming with MIMOLA}, journal = {Informatik-Fachberichte}, year = {1980}, volume = {27}, pages = {164-177}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1980-gi-Vol27.pdf}, confidential = {n}, abstract = {MIMOLA is a language for the optimized design of digital processors, based upon computing resource utilizations for typical programs. It has been used for the design of a well-structured, fast, parallel and microprogrammable processor. Although not larger than a conventional minicomputer, it is about 26 times faster. It proves that microcode need not be larger than equivalent machine code. This paper also discusses possible architecture alternatives with low cost/performance ratios.}, }MIMOLA is a language for the optimized design of digital processors, based upon computing resource utilizations for typical programs. It has been used for the design of a well-structured, fast, parallel and microprogrammable processor. Although not larger than a conventional minicomputer, it is about 26 times faster. It proves that microcode need not be larger than equivalent machine code. This paper also discusses possible architecture alternatives with low cost/performance ratios. Peter Marwedel.Influence of linear and nonlinear systems on stochastic processes (in German: Einfluss linearer und nichtlinearer Systeme auf stochastische Prozesse). Archiv fuer Elektronik und Uebertragungstechnik (AEUe) 29, pages 480-484 1975, OCR errors are possible[BibTeX][PDF][Abstract]@article { marwedel:1975:journal, author = {Marwedel, Peter}, title = {Influence of linear and nonlinear systems on stochastic processes (in German: Einfluss linearer und nichtlinearer Systeme auf stochastische Prozesse)}, journal = {Archiv fuer Elektronik und Uebertragungstechnik (AEUe)}, year = {1975}, volume = {29}, pages = {480-484}, note = {OCR errors are possible}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1975-marwedel.pdf}, confidential = {n}, abstract = {The Influence of Linear and Nonlinear Systems upon Stochastic Processes. This contribution is concerned with the relations between the autocorrelation functions of stochastic processes at the in- and outputs of linear and nonlinear systems. The classical theory for second-order autocorrelation functions is extended. Generalized autocorrelation functions (general-order moment functions) are introduced and the general-order autocorrelation function of a Gaussian process that is passed first through a zero-memory nonlinear system, then through a linear filter is given. The procedure used is based upon the characteristic function method. Zusammenfassung: Dieser Beitrag befa{\ss}t sich mit den Zusammenh\"angen zwischen den Autokorrelationsfunktionen (AKF) stochastischer Prozesse an Ein- und Ausg\"angen linearer und nichtlinearer Systeme. Die klassische Theorie f\"ur AKF 2. Ordnung wird erg\"anzt. Au{\ss}erdem werden auf beliebige Ordnung verallgemeinerte AKF ("Momentfunktionen") eingef\"uhrt und Aussagen \"uber die verallgemeinerten AKF nichtlinear geformter und anschlie{\ss}end linear gefilterter gau{\ss}scher Prozesse getroffen.
Das verwendete Verfahren basiert auf der Methode der charakteristischen Funktion.}, }The Influence of Linear and Nonlinear Systems upon Stochastic Processes. This contribution is concerned with the relations between the autocorrelation functions of stochastic processes at the in- and outputs of linear and nonlinear systems. The classical theory for second-order autocorrelation functions is extended. Generalized autocorrelation functions (general-order moment functions) are introduced and the general-order autocorrelation function of a Gaussian process that is passed first through a zero-memory nonlinear system, then through a linear filter is given. The procedure used is based upon the characteristic function method. Summary (translated from the German): This contribution deals with the relations between the autocorrelation functions (ACFs) of stochastic processes at the inputs and outputs of linear and nonlinear systems. The classical theory for second-order ACFs is supplemented. In addition, ACFs generalized to arbitrary order ("moment functions") are introduced, and statements are derived about the generalized ACFs of Gaussian processes that are first nonlinearly transformed and then linearly filtered. The procedure used is based on the characteristic function method.
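As a side note to the 1975 entry above: a classical textbook instance of the kind of input-output relation it studies (supplied here for illustration, not taken from the paper itself) is the arcsine law for a zero-mean stationary Gaussian process passed through a hard limiter, followed by the usual second-order effect of a linear filter:

```latex
% Classical illustration (not from the paper): x(t) zero-mean stationary
% Gaussian with autocorrelation R_x(\tau); the memoryless ("zero-memory")
% nonlinearity y = sgn(x) yields the arcsine law
\[
  R_y(\tau) \;=\; \frac{2}{\pi}\,\arcsin\!\left(\frac{R_x(\tau)}{R_x(0)}\right),
\]
% and a subsequent linear filter with impulse response h acts on the
% second-order statistics in the standard way:
\[
  R_z(\tau) \;=\; \bigl(h \star \tilde{h} \star R_y\bigr)(\tau),
  \qquad \tilde{h}(t) := h(-t).
\]
```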
Jian-Jia Chen, Geoffrey Nelissen, Wen-Hung Huang, Maolin Yang, Björn Brandenburg, Konstantinos Bletsas, Cong Liu, Pascal Richard, Frédéric Ridouard, Neil Audsley, Raj Rajkumar, Dionisio Niz and Georg von der Brüggen.Many Suspensions, Many Problems: A Review of Self-Suspending Tasks in Real-Time Systems.Technical Report #854, Department of Computer Science, TU Dortmund March 2017, (Status: Preprint, 2nd Version) The first version was published in May 2016.[BibTeX][PDF][Abstract]@techreport { ChenReport854-2016, author = {Chen, Jian-Jia and Nelissen, Geoffrey and Huang, Wen-Hung and Yang, Maolin and Brandenburg, Bj\"orn and Bletsas, Konstantinos and Liu, Cong and Richard, Pascal and Ridouard, Fr\'ed\'eric and Audsley, Neil and Rajkumar, Raj and Niz, Dionisio and Br\"uggen, Georg von der}, title = {Many Suspensions, Many Problems: A Review of Self-Suspending Tasks in Real-Time Systems}, institution = {Department of Computer Science, TU Dortmund}, year = {2017}, number = {854}, month = {March}, note = { (Status: Preprint, 2nd Version) The first version was published in May 2016.}, keywords = {kevin, Georg}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017-chen-techreport-854-v2.pdf}, confidential = {n}, abstract = {In general computing systems, a job (process/task) may suspend itself whilst it is waiting for some activity to complete, e.g., an accelerator to return data. In real-time systems, such self-suspension can cause substantial performance/schedulability degradation. This observation, first made in 1988, has led to the investigation of the impact of self-suspension on timing predictability, and many relevant results have been published since. Unfortunately, as it has recently come to light, a number of the existing results are flawed. To provide a correct platform on which future research can be built, this paper reviews the state of the art in the design and analysis of scheduling algorithms and schedulability tests for self-suspending tasks in real-time systems. We provide (1)~a systematic description of how self-suspending tasks can be handled in both soft and hard real-time systems; (2)~an explanation of the existing misconceptions and their potential remedies; (3)~an assessment of the influence of such flawed analyses on partitioned multiprocessor fixed-priority scheduling when tasks synchronize access to shared resources; and (4)~a discussion of the computational complexity of analyses for different self-suspension task models. }, }In general computing systems, a job (process/task) may suspend itself whilst it is waiting for some activity to complete, e.g., an accelerator to return data. In real-time systems, such self-suspension can cause substantial performance/schedulability degradation. This observation, first made in 1988, has led to the investigation of the impact of self-suspension on timing predictability, and many relevant results have been published since. Unfortunately, as it has recently come to light, a number of the existing results are flawed. To provide a correct platform on which future research can be built, this paper reviews the state of the art in the design and analysis of scheduling algorithms and schedulability tests for self-suspending tasks in real-time systems.
We provide (1) a systematic description of how self-suspending tasks can be handled in both soft and hard real-time systems; (2) an explanation of the existing misconceptions and their potential remedies; (3) an assessment of the influence of such flawed analyses on partitioned multiprocessor fixed-priority scheduling when tasks synchronize access to shared resources; and (4) a discussion of the computational complexity of analyses for different self-suspension task models. Jian-Jia Chen, Geoffrey Nelissen and Wen-Hung Kevin Huang.A Unifying Response Time Analysis Framework for Dynamic Self-Suspending Tasks.Technical Report #850, Fakultät für Informatik, Technische Universität Dortmund 2016, This is an extended version of the same titled paper in ECRTS 2016[BibTeX][PDF][Abstract]@techreport { ChenReport850-suspension, author = {Chen, Jian-Jia and Nelissen, Geoffrey and Huang, Wen-Hung Kevin}, title = {A Unifying Response Time Analysis Framework for Dynamic Self-Suspending Tasks}, institution = {Fakult\"at f\"ur Informatik, Technische Universit\"at Dortmund}, year = {2016}, number = {850}, note = {This is an extended version of the same titled paper in ECRTS 2016}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-chen-report-850.pdf}, confidential = {n}, abstract = { For real-time embedded systems, self-suspending behaviors can cause substantial performance/schedulability degradations. In this paper, we focus on preemptive fixed-priority scheduling for the dynamic self-suspension task model on uniprocessor. This model assumes that a job of a task can dynamically suspend itself during its execution (for instance, to wait for shared resources or access co-processors or external devices). The total suspension time of a job is upper-bounded, but this dynamic behavior drastically influences the interference generated by this task on lower-priority tasks. The state-of-the-art results for this task model can be classified into three categories: (i) modeling suspension as computation, (ii) modeling suspension as release jitter, and (iii) modeling suspension as a blocking term. However, several results associated to the release jitter approach have been recently proven to be erroneous, and the concept of modeling suspension as blocking was never formally proven correct. This paper presents a unifying response time analysis framework for the dynamic self-suspending task model. We provide a rigorous proof and show that the existing analyses pertaining to the three categories mentioned above are analytically dominated by our proposed solution. Therefore, all those techniques are in fact correct, but they are inferior to the proposed response time analysis in this paper. The evaluation results show that our analysis framework can generate huge improvements (an increase of up to $50\%$ of the number of task sets deemed schedulable) over these state-of-the-art analyses.}, } For real-time embedded systems, self-suspending behaviors can cause substantial performance/schedulability degradations. In this paper, we focus on preemptive fixed-priority scheduling for the dynamic self-suspension task model on uniprocessor. This model assumes that a job of a task can dynamically suspend itself during its execution (for instance, to wait for shared resources or access co-processors or external devices). The total suspension time of a job is upper-bounded, but this dynamic behavior drastically influences the interference generated by this task on lower-priority tasks.
The state-of-the-art results for this task model can be classified into three categories: (i) modeling suspension as computation, (ii) modeling suspension as release jitter, and (iii) modeling suspension as a blocking term. However, several results associated to the release jitter approach have been recently proven to be erroneous, and the concept of modeling suspension as blocking was never formally proven correct. This paper presents a unifying response time analysis framework for the dynamic self-suspending task model. We provide a rigorous proof and show that the existing analyses pertaining to the three categories mentioned above are analytically dominated by our proposed solution. Therefore, all those techniques are in fact correct, but they are inferior to the proposed response time analysis in this paper. The evaluation results show that our analysis framework can generate huge improvements (an increase of up to 50% of the number of task sets deemed schedulable) over these state-of-the-art analyses. Jian-Jia Chen.Erratum: Global Deadline-Monotonic Scheduling of Arbitrary-Deadline Sporadic Task Systems.Technical Report, TU Dortmund 2016[BibTeX][PDF][Abstract]@techreport { ChenErratum-globalDM-2007, author = {Chen, Jian-Jia}, title = {Erratum: Global Deadline-Monotonic Scheduling of Arbitrary-Deadline Sporadic Task Systems}, institution = {TU Dortmund}, year = {2016}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-chen-erratum-globalDM.pdf}, confidential = {n}, abstract = { This paper presents an error in the schedulability test for global deadline-monotonic scheduling of arbitrary-deadline sporadic task systems in identical multiprocessor systems proposed by Baruah and Fisher in OPODIS 2007. This erratum provides a simple fix. Fortunately, the speedup bound $2+\sqrt{3}$ claimed in their paper remains valid with this simple fix.}, } This paper presents an error in the schedulability test for global deadline-monotonic scheduling of arbitrary-deadline sporadic task systems in identical multiprocessor systems proposed by Baruah and Fisher in OPODIS 2007. This erratum provides a simple fix. Fortunately, the speedup bound 2+√3 claimed in their paper remains valid with this simple fix. Wen-Hung Huang and Jian-Jia Chen.Schedulability and Priority Assignment for Multi-Segment Self-Suspending Real-Time Tasks under Fixed-Priority Scheduling.Technical Report, TU Dortmund 2015[BibTeX][PDF][Abstract]@techreport { WJ15, author = {Huang, Wen-Hung and Chen, Jian-Jia}, title = {Schedulability and Priority Assignment for Multi-Segment Self-Suspending Real-Time Tasks under Fixed-Priority Scheduling}, institution = {TU Dortmund}, year = {2015}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-technical-report-multi-seg-Kevin.pdf}, confidential = {n}, abstract = {Self-suspension is becoming an increasingly prominent characteristic in real-time systems such as: (i) I/O-intensive systems, (ii) multi-core processors, and (iii) computation offloading systems with coprocessors, like Graphics Processing Units (GPUs). In this paper, we study the schedulability of multi-segment self-suspension tasks under fixed-priority scheduling, where the executions of a multi-segment self-suspension task alternate between pre-defined computation segments and suspension intervals. In particular, we do not use any enforcement to control the releases of computation segments and suspension intervals.
Such an enforcement can prevent jitter but may incur non-negligible overheads. This work presents a combined method using the proposed multi-segment workload function to compute the upper bound on the worst-case response time (WCRT) of multi-segment tasks. To the best of our knowledge, this is the first study that successfully provides a pseudo-polynomial-time test for multi-segment self-suspending hard real-time systems under fixed-priority scheduling without any additional execution control. We also show that the proposed analysis is compatible with Audsley's Priority Assignment. Our empirical investigations show that the proposed approach is highly effective in terms of the number of task sets deemed to be schedulable. }, }Self-suspension is becoming an increasingly prominent characteristic in real-time systems such as: (i) I/O-intensive systems, (ii) multi-core processors, and (iii) computation offloading systems with coprocessors, like Graphics Processing Units (GPUs). In this paper, we study the schedulability of multi-segment self-suspension tasks under fixed-priority scheduling, where the executions of a multi-segment self-suspension task alternate between pre-defined computation segments and suspension intervals. In particular, we do not use any enforcement to control the releases of computation segments and suspension intervals. Such an enforcement can prevent jitter but may incur non-negligible overheads. This work presents a combined method using the proposed multi-segment workload function to compute the upper bound on the worst-case response time (WCRT) of multi-segment tasks. To the best of our knowledge, this is the first study that successfully provides a pseudo-polynomial-time test for multi-segment self-suspending hard real-time systems under fixed-priority scheduling without any additional execution control. We also show that the proposed analysis is compatible with Audsley's Priority Assignment. Our empirical investigations show that the proposed approach is highly effective in terms of the number of task sets deemed to be schedulable.
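As a side note to the self-suspension entries above, the following is a minimal sketch of the simplest sufficient analysis those abstracts mention, namely "modeling suspension as computation": the suspension time is added to the worst-case execution time in the standard fixed-priority response-time recurrence. The task set and all numbers are invented for illustration; this is neither the unifying framework nor the multi-segment test of the cited reports.

```python
import math

# Suspension-oblivious response-time analysis (illustrative only).
# A task i is (C_i, S_i, T_i): WCET, max total self-suspension, period
# (implicit deadlines = periods). Tasks are indexed by decreasing priority.
# Recurrence: R_i = C_i + S_i + sum_j ceil(R_i / T_j) * (C_j + S_j).
def response_time(tasks, i, limit=10_000):
    C, S, T = tasks[i]
    R = C + S                       # suspension inflates the task's own demand
    for _ in range(limit):
        # higher-priority interference; (C_j + S_j) is the safe inflation
        interference = sum(math.ceil(R / Tj) * (Cj + Sj)
                           for (Cj, Sj, Tj) in tasks[:i])
        R_next = C + S + interference
        if R_next == R:
            return R                # fixed point: worst-case response time bound
        R = R_next
    return math.inf                 # no convergence within the iteration bound

tasks = [(1, 1, 5), (2, 0, 10), (3, 2, 20)]   # hypothetical task set
for i in range(len(tasks)):
    R = response_time(tasks, i)
    print(f"task {i}: R <= {R}, deadline {tasks[i][2]}, "
          f"{'schedulable' if R <= tasks[i][2] else 'unknown'}")
```

This test is safe but pessimistic, which is precisely why the entries above develop tighter analyses; for the hypothetical set shown it bounds the lowest-priority task's response time at 15 against a deadline of 20.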
Helena Kotthaus, Ingo Korb and Peter Marwedel.Performance Analysis for Parallel R Programs: Towards Efficient Resource Utilization.Technical Report #01/2015, Department of Computer Science 12, TU Dortmund University July 2015, SFB876 Project A3[BibTeX][PDF][Link]@techreport { kotthaus/2015c, author = {Kotthaus, Helena and Korb, Ingo and Marwedel, Peter}, title = {Performance Analysis for Parallel R Programs: Towards Efficient Resource Utilization}, institution = {Department of Computer Science 12, TU Dortmund University}, year = {2015}, number = {01/2015}, month = {July}, note = {SFB876 Project A3}, url = {http://sfb876.tu-dortmund.de/SPP/sfb876-a3.html}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015_kotthaus_tr.pdf}, confidential = {n}, } Konstantinos Bletsas, Neil Audsley, Wen-Hung Huang, Jian-Jia Chen and Geoffrey Nelissen.Errata for three papers (2004-05) on fixed-priority scheduling with self-suspensions.Technical Report #CISTER-TR-150713, CISTER July 2015[BibTeX][PDF][Abstract]@techreport { BletsasReport2015, author = {Bletsas, Konstantinos and Audsley, Neil and Huang, Wen-Hung and Chen, Jian-Jia and Nelissen, Geoffrey}, title = {Errata for three papers (2004-05) on fixed-priority scheduling with self-suspensions}, institution = {CISTER}, year = {2015}, number = {CISTER-TR-150713}, month = {July}, keywords = {kevin}, file = {http://www.cister.isep.ipp.pt/docs/errata_for_three_papers_(2004_05)_on_fixed_priority_scheduling_with_self_suspensions/1133/attach.pdf}, confidential = {n}, abstract = {The purpose of this short paper is to (i) highlight the flaws in previously published work (2004-2005) by some of the authors on worst-case response time analysis for tasks with self-suspensions and (ii) provide straightforward fixes for those flaws, rendering the analysis safe. }, }The purpose of this short paper is to (i) highlight the flaws in previously published work (2004-2005) by some of the authors on worst-case response time analysis for tasks with self-suspensions and (ii) provide straightforward fixes for those flaws, rendering the analysis safe. Florian Schmoll, Andreas Heinig, Peter Marwedel and Michael Engel.Passing error handling information from a compiler to runtime components.Technical Report #844, TU Dortmund, Faculty of Computer Science 12 2014[BibTeX][PDF][Abstract]@techreport { tr844, author = {Schmoll, Florian and Heinig, Andreas and Marwedel, Peter and Engel, Michael}, title = {Passing error handling information from a compiler to runtime components}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2014}, type = {Technical Report}, number = {844}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-schmoll-tr844.pdf}, confidential = {n}, abstract = {For the handling of faults in embedded systems, software implemented fault tolerance seems to be more appropriate than hardware based approaches. Using software based techniques, runtime conditions only known to software can be considered. Also, the error handling can be application specific and there are more alternatives for decisions during error handling resulting in a more flexible approach. By adapting the error handling to the requirements of the software, resources can be saved. A recent publication showed that a compiler that evaluates source-code annotations and applies static analyses can determine which errors require error handling, and which errors can be ignored safely.
This error handling information can improve the efficiency of software implemented error handling. Additional information about how erroneous data can be handled is also provided by source code annotations. However, error handling information is needed at runtime, when error handling actually takes place. Unfortunately, this computation is too complex to be performed on demand on embedded systems. Hence, the relevant information has to be precomputed at compile time, but must be retrievable at runtime. In this report we present how compiler generated information about the error handling of data objects can be made available to runtime components that apply error correction.}, }For the handling of faults in embedded systems, software implemented fault tolerance seems to be more appropriate than hardware based approaches. Using software based techniques, runtime conditions only known to software can be considered. Also, the error handling can be application specific and there are more alternatives for decisions during error handling resulting in a more flexible approach. By adapting the error handling to the requirements of the software, resources can be saved. A recent publication showed that a compiler that evaluates source-code annotations and applies static analyses can determine which errors require error handling, and which errors can be ignored safely. This error handling information can improve the efficiency of software implemented error handling. Additional information about how erroneous data can be handled is also provided by source code annotations. However, error handling information is needed at runtime, when error handling actually takes place. Unfortunately, this computation is too complex to be performed on demand on embedded systems. Hence, the relevant information has to be precomputed at compile time, but must be retrievable at runtime. In this report we present how compiler generated information about the error handling of data objects can be made available to runtime components that apply error correction. Jian-Jia Chen and Kunal Agrawal.Capacity Augmentation Bounds for Parallel DAG Tasks under G-EDF and G-RM.Technical Report #845, TU Dortmund July 2014[BibTeX][PDF][Abstract]@techreport { chenagrawal2014tech, author = {Chen, Jian-Jia and Agrawal, Kunal}, title = {Capacity Augmentation Bounds for Parallel DAG Tasks under G-EDF and G-RM}, institution = {TU Dortmund}, year = {2014}, number = {845}, month = {July}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/techreport-jjchen-july2014.pdf}, confidential = {n}, abstract = {This paper considers global earliest-deadline-first (EDF) and global rate-monotonic scheduling for a general task model for parallel sporadic real-time tasks. In particular, each sporadic real-time task is characterized by the general directed acyclic graph (DAG). This paper provides the utilization-based analysis to test the schedulability of global EDF and global rate-monotonic scheduling. We show that if on unit-speed processors, a task set has total utilization of at most m and the critical path length of each task is smaller than its deadline, then global EDF can schedule that task set on m processors of speed 2.6181, defined as the capacity augmentation bound. Together with the lower bound on the speeding up, we close the gap for global EDF when m is sufficiently large. This is the best known capacity augmentation bound for parallel DAG tasks under any scheduling strategy.
In addition, we also show that global rate monotonic scheduling has a capacity augmentation bound of 3.7321 with a similar analysis procedure, the best known capacity augmentation bound for fixed priority scheduling of the general DAG tasks. For global EDF and global RM, we also present utilization-based schedulability analysis tests based on the utilization and the maximum critical path utilization.}, }This paper considers global earliest-deadline-first (EDF) and global rate-monotonic scheduling for a general task model for parallel sporadic real-time tasks. In particular, each sporadic real-time task is characterized by the general directed acyclic graph (DAG). This paper provides the utilization-based analysis to test the schedulability of global EDF and global rate-monotonic scheduling. We show that if on unit-speed processors, a task set has total utilization of at most m and the critical path length of each task is smaller than its deadline, then global EDF can schedule that task set on m processors of speed 2.6181, defined as the capacity augmentation bound. Together with the lower bound on the speeding up, we close the gap for global EDF when m is sufficiently large. This is the best known capacity augmentation bound for parallel DAG tasks under any scheduling strategy. In addition, we also show that global rate monotonic scheduling has a capacity augmentation bound of 3.7321 with a similar analysis procedure, the best known capacity augmentation bound for fixed priority scheduling of the general DAG tasks. For global EDF and global RM, we also present utilization-based schedulability analysis tests based on the utilization and the maximum critical path utilization. Timon Kelter, Heiko Falk, Peter Marwedel, Sudipta Chattopadhyay and Abhik Roychoudhury.Bus-Aware Multicore WCET Analysis through TDMA Offset Bounds.Technical Report #837, TU Dortmund, Faculty of Computer Science 12 January 2011[BibTeX][PDF][Abstract]@techreport { KFM+11, author = {Kelter, Timon and Falk, Heiko and Marwedel, Peter and Chattopadhyay, Sudipta and Roychoudhury, Abhik}, title = {Bus-Aware Multicore WCET Analysis through TDMA Offset Bounds}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2011}, type = {Technical Report}, number = {837}, month = {jan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-kelterTR837.pdf}, confidential = {n}, abstract = {In the domain of real-time systems, the analysis of the timing behavior of programs is crucial for guaranteeing the schedulability and thus the safety of a system. Static analyses of the \textit{WCET} (Worst-Case Execution Time) have proven to be a key element for timing analysis, as they provide safe upper bounds on a program's execution time. For single-core systems, industrial-strength WCET analyzers are already available, but up to now, only first proposals have been made to analyze the WCET in multicore systems, where the different cores may interfere during the access to shared resources. Important examples of this are shared buses which connect the cores to a shared main memory. The time to gain access to the shared bus may vary significantly, depending on the used bus arbitration protocol and the access timings. In this report, we propose a new technique for analyzing the duration of accesses to shared buses. We implemented a prototype tool which uses the new analysis and tested it on a set of real-world benchmarks.
Results demonstrate that our analysis achieves the same precision as the best existing approach while drastically outperforming it in terms of analysis time.}, }In the domain of real-time systems, the analysis of the timing behavior of programs is crucial for guaranteeing the schedulability and thus the safety of a system. Static analyses of the WCET (Worst-Case Execution Time) have proven to be a key element for timing analysis, as they provide safe upper bounds on a program's execution time. For single-core systems, industrial-strength WCET analyzers are already available, but up to now, only first proposals have been made to analyze the WCET in multicore systems, where the different cores may interfere during the access to shared resources. Important examples of this are shared buses which connect the cores to a shared main memory. The time to gain access to the shared bus may vary significantly, depending on the bus arbitration protocol used and on the access timings. In this report, we propose a new technique for analyzing the duration of accesses to shared buses. We implemented a prototype tool which uses the new analysis and tested it on a set of real-world benchmarks. Results demonstrate that our analysis achieves the same precision as the best existing approach while drastically outperforming it in terms of analysis time. Constantin Timm, Andrej Gelenberg, Frank Weichert and Peter Marwedel.Reducing the Energy Consumption of Embedded Systems by Integrating General Purpose GPUs.Technical Report #829, TU Dortmund, Faculty of Computer Science 12 2010[BibTeX][PDF][Abstract]@techreport { Timm:2010, author = {Timm, Constantin and Gelenberg, Andrej and Weichert, Frank and Marwedel, Peter}, title = {Reducing the Energy Consumption of Embedded Systems by Integrating General Purpose GPUs}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2010}, number = {829}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-timmTR829.pdf}, confidential = {n}, abstract = {Nowadays, General Purpose Computing on GPUs (GPGPU) accelerates many industrial and scientific applications in the high performance computing (HPC) domain. Recently, GPU vendors, such as Nvidia and AMD, have promoted the utilization of high-end GPUs in embedded systems. The intention of the GPU vendors is the acceleration of traditional graphics computations, but in analogy to the HPC desktop domain, GPUs could also be used as GPGPUs in the embedded domain. However, energy constraints are omnipresent in the embedded world and therefore, one central question for embedded system designers is: Can energy be saved by using an additional GPGPU-equipped graphics card to accelerate general purpose applications? This paper first discusses the theoretical background of an energy-aware embedded system design including a GPGPU-equipped graphics card. Second, in order to support these theoretical considerations, an energy and runtime evaluation of a low-power GPU/CPU system is presented. We demonstrate that a profitable GPU integration, seen from an energy perspective, strongly depends on the structure and the features of an application, such as high parallelizability and the utilization level of the graphics card.
The evaluation of several real-world benchmarks shows that increasing the system's power consumption by integrating a GPU can lead to a reduced overall energy consumption of the system.}, }Nowadays, General Purpose Computing on GPUs (GPGPU) accelerates many industrial and scientific applications in the high performance computing (HPC) domain. Recently, GPU vendors, such as Nvidia and AMD, have promoted the utilization of high-end GPUs in embedded systems. The intention of the GPU vendors is the acceleration of traditional graphics computations, but in analogy to the HPC desktop domain, GPUs could also be used as GPGPUs in the embedded domain. However, energy constraints are omnipresent in the embedded world and therefore, one central question for embedded system designers is: Can energy be saved by using an additional GPGPU-equipped graphics card to accelerate general purpose applications? This paper first discusses the theoretical background of an energy-aware embedded system design including a GPGPU-equipped graphics card. Second, in order to support these theoretical considerations, an energy and runtime evaluation of a low-power GPU/CPU system is presented. We demonstrate that a profitable GPU integration, seen from an energy perspective, strongly depends on the structure and the features of an application, such as high parallelizability and the utilization level of the graphics card. The evaluation of several real-world benchmarks shows that increasing the system's power consumption by integrating a GPU can lead to a reduced overall energy consumption of the system. Frank Weichert, Marcel Gaspar, Constantin Timm, Alexander Zybin, Evgeny Gurevich, Michael Engel, Heinrich Müller and Peter Marwedel.Signal Analysis and Classification for Plasmon Assisted Microscopy of Nanoobjects.Technical Report #830, TU Dortmund, Faculty of Computer Science 12 2010[BibTeX][PDF][Abstract]@techreport { Weichert:2010c, author = {Weichert, Frank and Gaspar, Marcel and Timm, Constantin and Zybin, Alexander and Gurevich, Evgeny and Engel, Michael and M{\"u}ller, Heinrich and Marwedel, Peter}, title = {Signal Analysis and Classification for Plasmon Assisted Microscopy of Nanoobjects}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2010}, number = {830}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-weichertTR830.pdf}, confidential = {n}, abstract = {In this paper we suggest a novel technique for surface plasmon resonance assisted detection of viruses and nanoparticles which can be applied for the rapid analysis of large data volumes. The importance of such an efficient detection method for viruses is evident in view of globally spreading virus infections. The technique is based on the segmentation of slices, aimed at an automatic identification of nanoparticles, in which detection is based on position-stationary spatiotemporal data using a one-dimensional signal-analysis and -classification approach. As the data source, a CCD camera taking a sequence of snapshots from a surface plasmon assisted microscope is used. A one-dimensional intensity analysis approach is applied for segmentation by classifying time-dependent 1D gray-level profiles and combining them into spatial 2D segments.}, }In this paper we suggest a novel technique for surface plasmon resonance assisted detection of viruses and nanoparticles which can be applied for the rapid analysis of large data volumes.
The importance of such an efficient detection method for viruses is evident in view of globally spreading virus infections. The technique is based on the segmentation of slices, aimed at an automatic identification of nanoparticles, in which detection is based on position-stationary spatiotemporal data using a one-dimensional signal-analysis and -classification approach. As the data source, a CCD camera taking a sequence of snapshots from a surface plasmon assisted microscope is used. A one-dimensional intensity analysis approach is applied for segmentation by classifying time-dependent 1D gray-level profiles and combining them into spatial 2D segments. Andreas Heinig.R2G: Supporting POSIX like semantics in a distributed RTEMS system.Technical Report #836, TU Dortmund, Faculty of Computer Science 12 December 2010[BibTeX][PDF][Abstract]@techreport { heinig:2010:836, author = {Heinig, Andreas}, title = {R2G: Supporting POSIX like semantics in a distributed RTEMS system}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2010}, type = {Technical Report}, number = {836}, month = {dec}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-r2g-report-836.pdf}, confidential = {n}, abstract = {R2G (pronounced: R squared G) is an extension to the open-source RTEMS real-time operating system. The purpose of R2G is to remove the limitations of RTEMS in the context of multi-threaded applications and to support IMEC's RTLib, which implements the parallelization of the MNEMEE Tool Flow. R2G establishes the connection between several tools. The parallelized source code produced either by MPMH (IMEC) or by the MNEMEE Tool Flow (MPMH + ICD-C + TGE) can now be executed on several simulators including MPARM and CoMET. Due to the high simulation speed of CoMET, large benchmark applications can be executed. We implemented two new CoMET-based platforms with a flat and a hierarchical memory layout, respectively. On the hierarchical platform, memory optimization and mapping tools can fully exploit their optimizations.}, }R2G (pronounced: R squared G) is an extension to the open-source RTEMS real-time operating system. The purpose of R2G is to remove the limitations of RTEMS in the context of multi-threaded applications and to support IMEC's RTLib, which implements the parallelization of the MNEMEE Tool Flow. R2G establishes the connection between several tools. The parallelized source code produced either by MPMH (IMEC) or by the MNEMEE Tool Flow (MPMH + ICD-C + TGE) can now be executed on several simulators including MPARM and CoMET. Due to the high simulation speed of CoMET, large benchmark applications can be executed. We implemented two new CoMET-based platforms with a flat and a hierarchical memory layout, respectively. On the hierarchical platform, memory optimization and mapping tools can fully exploit their optimizations.
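The two-stage analysis sketched in the Weichert et al. abstract above (classify each pixel's 1D gray-level time profile, then merge hits into spatial 2D segments) can be illustrated compactly. The following Python sketch is purely illustrative and not taken from the report: it assumes the frame stack is a NumPy array, substitutes a simple matched-filter threshold for the report's classifier, and uses a basic flood fill for the spatial merging; all parameter values are invented.

```python
import numpy as np

def detect_particles(frames, template, threshold=0.8):
    """Classify per-pixel time profiles, then merge hits into 2D segments.

    frames:   (T, H, W) stack of CCD snapshots (stand-in for the
              plasmon-microscopy image sequence).
    template: (T,) expected intensity step caused by a particle binding.
    """
    T, H, W = frames.shape
    profiles = frames.reshape(T, H * W).astype(float)
    template = np.asarray(template, dtype=float)
    # Normalize each 1D gray-level profile and the template.
    p = (profiles - profiles.mean(0)) / (profiles.std(0) + 1e-9)
    t = (template - template.mean()) / (template.std() + 1e-9)
    # Correlate every pixel's time profile with the template (1D classification).
    score = (p * t[:, None]).mean(0).reshape(H, W)
    hits = score > threshold
    # Merge 4-connected hits into spatial 2D segments via flood fill.
    labels = np.zeros((H, W), dtype=int)
    n_segments = 0
    for y in range(H):
        for x in range(W):
            if hits[y, x] and labels[y, x] == 0:
                n_segments += 1
                stack = [(y, x)]
                while stack:
                    cy, cx = stack.pop()
                    if 0 <= cy < H and 0 <= cx < W and hits[cy, cx] and labels[cy, cx] == 0:
                        labels[cy, cx] = n_segments
                        stack += [(cy + 1, cx), (cy - 1, cx), (cy, cx + 1), (cy, cx - 1)]
    return labels, n_segments
```

A real pipeline would replace the matched filter with the trained classifier from the report; the sketch only shows the overall structure of per-pixel 1D classification followed by 2D merging.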
Heiko Falk.Control Flow Optimization by Loop Nest Splitting at the Source Code Level.Technical Report #773, TU Dortmund, Faculty of Computer Science 12 2002[BibTeX][PDF][Abstract]@techreport { falk:2002, author = {Falk, Heiko}, title = {Control Flow Optimization by Loop Nest Splitting at the Source Code Level}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2002}, type = {Technical Report}, number = {773}, keywords = {sco}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-TechReport-773.pdf}, confidential = {n}, abstract = {In recent years, the application of optimization techniques at the level of program source codes has increasingly attracted interest due to the high effectiveness and the inherent retargetability of such approaches. In this report, a novel source code transformation technique for control flow optimization called loop nest splitting is presented. The goal of this optimization is to reduce runtimes and energy consumption by minimizing the number of if-statements executed in loop nests of typical embedded multimedia applications. Complementary to already known optimizations in this area, we explicitly focus on the optimization of loop-variant if-statements. The analysis techniques required for performing loop nest splitting are illustrated in detail. They are based on precise mathematical models combined with genetic algorithms. The analysis is done statically at compile time and does not rely on profiling. For a detailed evaluation of the benefits of loop nest splitting, the effects of our optimization with respect to instruction pipeline and cache behavior, runtimes, energy consumption and code sizes are shown. The application of our implemented tools for loop nest splitting to three real-life multimedia benchmarks leads to average reductions of pipeline stalls between 19.7\% and 64.8\% and an average decrease of instruction cache misses between 8.9\% and 45.3\%. Measurements on a variety of different programmable processors show average speed-ups of the benchmarks between 23.6\% and 62.1\%, whereas reductions of energy dissipation between 19.2\% and 57.6\% are observed.}, }In recent years, the application of optimization techniques at the level of program source codes has increasingly attracted interest due to the high effectiveness and the inherent retargetability of such approaches. In this report, a novel source code transformation technique for control flow optimization called loop nest splitting is presented. The goal of this optimization is to reduce runtimes and energy consumption by minimizing the number of if-statements executed in loop nests of typical embedded multimedia applications. Complementary to already known optimizations in this area, we explicitly focus on the optimization of loop-variant if-statements. The analysis techniques required for performing loop nest splitting are illustrated in detail. They are based on precise mathematical models combined with genetic algorithms. The analysis is done statically at compile time and does not rely on profiling. For a detailed evaluation of the benefits of loop nest splitting, the effects of our optimization with respect to instruction pipeline and cache behavior, runtimes, energy consumption and code sizes are shown.
The application of our implemented tools for loop nest splitting to three real-life multimedia benchmarks leads to average reductions of pipeline stalls between 19.7% and 64.8% and an average decrease of instruction cache misses between 8.9% and 45.3%. Measurements on a variety of different programmable processors show average speed-ups of the benchmarks between 23.6% and 62.1%, whereas reductions of energy dissipation between 19.2% and 57.6% are observed. Rajeshwari Banakar, Stefan Steinke, Bo-Sik Lee, M. Balakrishnan and Peter Marwedel.Comparison of Cache- and Scratch-Pad based Memory Systems with respect to Performance, Area and Energy Consumption.Technical Report #762, TU Dortmund, Faculty of Computer Science 12 2001[BibTeX][PDF][Abstract]@techreport { banakar:2001:762, author = {Banakar, Rajeshwari and Steinke, Stefan and Lee, Bo-Sik and Balakrishnan, M. and Marwedel, Peter}, title = {Comparison of Cache- and Scratch-Pad based Memory Systems with respect to Performance, Area and Energy Consumption}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2001}, type = {Technical Report}, number = {762}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-TechReport-762.pdf}, confidential = {n}, abstract = {In this report we evaluate the options for low-power on-chip memories during system design and configuration. Specifically, we compare the use of scratch pad memories with that of caches on the basis of performance, area and energy. The target architecture used in our experiments is the AT91M40400 microcontroller containing an ARM7TDMI core. A packing algorithm is used to map the memory objects of the benchmarks to the scratch pad. Area and energy for different scratch pad and cache sizes are computed using the CACTI tool, while performance is derived using the trace results of the ARMulator. We observe area and performance improvements by using a scratch pad memory. For example, for bubble sort there is a performance improvement of 18\% from hardware which needs 34\% less area. The scratch pad also needs less energy per access, due to the absence of tag comparison.}, }In this report we evaluate the options for low-power on-chip memories during system design and configuration. Specifically, we compare the use of scratch pad memories with that of caches on the basis of performance, area and energy. The target architecture used in our experiments is the AT91M40400 microcontroller containing an ARM7TDMI core. A packing algorithm is used to map the memory objects of the benchmarks to the scratch pad. Area and energy for different scratch pad and cache sizes are computed using the CACTI tool, while performance is derived using the trace results of the ARMulator. We observe area and performance improvements by using a scratch pad memory. For example, for bubble sort there is a performance improvement of 18% from hardware which needs 34% less area. The scratch pad also needs less energy per access, due to the absence of tag comparison.
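The "packing algorithm" mentioned in the Banakar et al. abstract above maps memory objects onto a size-limited scratch pad. The abstract does not spell the algorithm out, so the following is only a hedged stand-in under a common formulation: each object has a size and a profiled energy saving when placed in the scratch pad, and a 0/1-knapsack dynamic program maximizes the total saving within the capacity. All object names and numbers are invented.

```python
def pack_scratchpad(objects, capacity):
    """0/1 knapsack: choose memory objects to place in the scratch pad.

    objects:  list of (name, size_bytes, energy_saving) tuples (hypothetical).
    capacity: scratch pad size in bytes.
    Returns (best_total_saving, chosen_names).
    """
    # best[c] = (saving, chosen_names) achievable with capacity c.
    best = [(0, [])] * (capacity + 1)
    for name, size, saving in objects:
        new_best = list(best)
        for c in range(size, capacity + 1):
            candidate = best[c - size][0] + saving
            if candidate > new_best[c][0]:
                new_best[c] = (candidate, best[c - size][1] + [name])
        best = new_best  # each object is placed at most once
    return best[capacity]

# Hypothetical 1 KB scratch pad and profiled per-object savings.
objs = [("main_loop", 400, 90), ("coeff_table", 512, 70),
        ("fir_buffer", 256, 40), ("init_code", 300, 10)]
print(pack_scratchpad(objs, 1024))  # -> (160, ['main_loop', 'coeff_table'])
```

The trade-off studied in the report additionally involves the area and access energy of the competing cache, which a plain knapsack over savings does not capture.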
Stefan Steinke, Rüdiger Schwarz, Lars Wehmeyer and Peter Marwedel.Low power code generation for a RISC processor by register pipelining.Technical Report #754, TU Dortmund, Faculty of Computer Science 12 2001[BibTeX][PDF][Abstract]@techreport { steinke:2001:754, author = {Steinke, Stefan and Schwarz, R\"udiger and Wehmeyer, Lars and Marwedel, Peter}, title = {Low power code generation for a RISC processor by register pipelining}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2001}, type = {Technical Report}, number = {754}, keywords = {ecc}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-TechReport-754.pdf}, confidential = {n}, abstract = {This paper presents the implementation of the compiler technique register pipelining with respect to energy optimization and its comparison against performance optimization. Generally, programs optimized for performance are also energy optimized. An exception to this rule is shown where the use of register pipelining improves the energy consumption by 17\% while bringing down performance by 8.8\%. Therefore, a detailed consideration of energy consumption within the processor and the memories is necessary.}, }This paper presents the implementation of the compiler technique register pipelining with respect to energy optimization and its comparison against performance optimization. Generally, programs optimized for performance are also energy optimized. An exception to this rule is shown where the use of register pipelining improves the energy consumption by 17% while bringing down performance by 8.8%. Therefore, a detailed consideration of energy consumption within the processor and the memories is necessary. Stefan Steinke, Christoph Zobiegala, Lars Wehmeyer and Peter Marwedel.Moving Program Objects to Scratch-Pad Memory for Energy Reduction.Technical Report #756, TU Dortmund, Faculty of Computer Science 12 2001[BibTeX][PDF][Abstract]@techreport { steinke:2001:756, author = {Steinke, Stefan and Zobiegala, Christoph and Wehmeyer, Lars and Marwedel, Peter}, title = {Moving Program Objects to Scratch-Pad Memory for Energy Reduction}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2001}, type = {Technical Report}, number = {756}, keywords = {ecc}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-TechReport-756.pdf}, confidential = {n}, abstract = {This paper presents a new approach for improving energy consumption of compiler generated software by using on-chip Scratch-Pad RAM more efficiently. This memory allocation technique moves program parts (functions or basic blocks and global data objects) into the limited Scratch-Pad RAM. Experimental results show that this technique saves up to 80\% of the total energy consumption depending on the application, the system architecture and the size of the Scratch-Pad RAM.}, }This paper presents a new approach for improving energy consumption of compiler generated software by using on-chip Scratch-Pad RAM more efficiently. This memory allocation technique moves program parts (functions or basic blocks and global data objects) into the limited Scratch-Pad RAM. Experimental results show that this technique saves up to 80% of the total energy consumption depending on the application, the system architecture and the size of the Scratch-Pad RAM. Manoj Kumar Jain, Lars Wehmeyer, Peter Marwedel and M. 
Balakrishnan.Register File Synthesis in ASIP Design.Technical Report #746, TU Dortmund, Faculty of Computer Science 12 2000[BibTeX][PDF][Abstract]@techreport { jain:2000:tech, author = {Jain, Manoj Kumar and Wehmeyer, Lars and Marwedel, Peter and Balakrishnan, M.}, title = {Register File Synthesis in ASIP Design}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2000}, type = {Technical Report}, number = {746}, keywords = {ecc}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2000-TechReport-746.pdf}, confidential = {n}, abstract = {Interest in the synthesis of Application Specific Instruction Set Processors or ASIPs has increased considerably and a number of methodologies have been proposed for ASIP design. A key step in ASIP synthesis involves deciding architectural features based on application requirements and constraints. In this report we observe the effect of changing the register file size on performance as well as on power and energy consumption. Detailed data is generated and analyzed for a number of application programs. Results indicate that the choice of an appropriate number of registers has a significant impact on performance.}, }Interest in the synthesis of Application Specific Instruction Set Processors or ASIPs has increased considerably and a number of methodologies have been proposed for ASIP design. A key step in ASIP synthesis involves deciding architectural features based on application requirements and constraints. In this report we observe the effect of changing the register file size on performance as well as on power and energy consumption. Detailed data is generated and analyzed for a number of application programs. Results indicate that the choice of an appropriate number of registers has a significant impact on performance. Birger Landwehr.Improving processor architecture exploitation by genetic algorithm based algebraic optimization.Technical Report #747, TU Dortmund, Faculty of Computer Science 12 2000[BibTeX][PDF][Abstract]@techreport { landwehr:2000:tech, author = {Landwehr, Birger}, title = {Improving processor architecture exploitation by genetic algorithm based algebraic optimization}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {2000}, type = {Technical Report}, number = {747}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2000-TechReport-747.pdf}, confidential = {n}, abstract = {This report presents a new approach for the algebraic optimization of computationally intensive applications. The presented approach is based upon the paradigm of simulated evolution, which has proven successful for solving large non-linear optimization problems. We introduce a chromosomal representation of data-flow graphs which ensures that the correctness of algebraic transformations realized by the genetic operators recombination, mutation, and selection is always preserved. We also present different fitness functions that allow the algorithm to be easily adapted to different processor architectures in order to produce the best feasible solution for the given target architecture. The presented method has been integrated as a C-to-C converter within a versatile compiler framework and has proven its efficiency for several DSP applications.}, }This report presents a new approach for the algebraic optimization of computationally intensive applications.
The presented approach is based upon the paradigm of simulated evolution, which has proven successful for solving large non-linear optimization problems. We introduce a chromosomal representation of data-flow graphs which ensures that the correctness of algebraic transformations realized by the genetic operators recombination, mutation, and selection is always preserved. We also present different fitness functions that allow the algorithm to be easily adapted to different processor architectures in order to produce the best feasible solution for the given target architecture. The presented method has been integrated as a C-to-C converter within a versatile compiler framework and has proven its efficiency for several DSP applications. S. S. Bhattacharyya, Rainer Leupers and Peter Marwedel.Software Synthesis and Code Generation for Signal Processing Systems.Technical Report #UMIACS-TR-99-57, Institute for Advanced Computer Studies, University of Maryland, College Park 20742, USA September 1999[BibTeX][PDF]@techreport { bhatt:1999:umd, author = {Bhattacharyya, S. S. and Leupers, Rainer and Marwedel, Peter}, title = {Software Synthesis and Code Generation for Signal Processing Systems}, institution = {Institute for Advanced Computer Studies, University of Maryland, College Park 20742, USA}, year = {1999}, type = {Technical Report}, number = {UMIACS-TR-99-57}, month = {September}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-tr-umd.pdf}, confidential = {n}, } Ulrich Bieker, Martin Kaibel, Peter Marwedel and Walter Geisselhardt.STAR-DUST: Hierarchical Test of Embedded Processors by Self-Test Programs.Technical Report #700, University of Dortmund, Dept. of CS XII 1998[BibTeX][PDF][Abstract]@techreport { bieker:1998:report, author = {Bieker, Ulrich and Kaibel, Martin and Marwedel, Peter and Geisselhardt, Walter}, title = {STAR-DUST: Hierarchical Test of Embedded Processors by Self-Test Programs}, institution = {University of Dortmund, Dept. of CS XII}, year = {1998}, type = {Technical Report}, number = {700}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1998-star-dust-report.pdf}, confidential = {n}, abstract = {This paper describes the hierarchical test-generation method STAR-DUST, using the self-test program generator RESTART, the test pattern generator DUST, the fault simulator FAUST and SYNOPSYS logic synthesis tools. RESTART aims at supporting the self-test of embedded processors. Its integration into the STAR-DUST environment allows test program generation for realistic fault assumptions and provides, for the first time, experimental data on the fault coverage that can be obtained for full processor models. Experimental data shows that fault masking is not a problem even though the considered processor has to perform result comparison and arithmetic operations in the same ALU.}, }This paper describes the hierarchical test-generation method STAR-DUST, using the self-test program generator RESTART, the test pattern generator DUST, the fault simulator FAUST and SYNOPSYS logic synthesis tools. RESTART aims at supporting the self-test of embedded processors. Its integration into the STAR-DUST environment allows test program generation for realistic fault assumptions and provides, for the first time, experimental data on the fault coverage that can be obtained for full processor models. Experimental data shows that fault masking is not a problem even though the considered processor has to perform result comparison and arithmetic operations in the same ALU.
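The genetic-algorithm-based algebraic optimization in the Landwehr abstract above can be illustrated on a toy scale. The sketch below is not the report's method, only a hedged illustration of the idea: individuals are expression trees, mutation applies a semantics-preserving algebraic rewrite (here only distributivity, in both directions; recombination is omitted for brevity), and the fitness function counts multiplications as a stand-in for a component-library cost model.

```python
import random

# Expressions are nested tuples ('+', l, r) / ('*', l, r) or variable names.

def cost(e):
    """Fitness stand-in: number of multiplications (cheaper is fitter)."""
    if isinstance(e, str):
        return 0
    op, l, r = e
    return (op == '*') + cost(l) + cost(r)

def rewrites(e):
    """All expressions reachable by one distributivity rewrite."""
    if isinstance(e, str):
        return []
    op, l, r = e
    out = []
    # a*b + a*c  ->  a*(b + c)   (factor out a common operand)
    if op == '+' and isinstance(l, tuple) and isinstance(r, tuple) \
            and l[0] == r[0] == '*' and l[1] == r[1]:
        out.append(('*', l[1], ('+', l[2], r[2])))
    # a*(b + c)  ->  a*b + a*c   (distribute)
    if op == '*' and isinstance(r, tuple) and r[0] == '+':
        out.append(('+', ('*', l, r[1]), ('*', l, r[2])))
    out += [(op, m, r) for m in rewrites(l)]   # rewrite inside left subtree
    out += [(op, l, m) for m in rewrites(r)]   # rewrite inside right subtree
    return out

def evolve(expr, generations=50, pop_size=8, seed=0):
    """Tiny (mu + lambda)-style loop: mutate and keep the cheapest survivors."""
    random.seed(seed)
    pop = [expr]
    for _ in range(generations):
        children = [random.choice(rewrites(p) or [p]) for p in pop for _ in (0, 1)]
        pop = sorted(set(pop + children), key=cost)[:pop_size]
    return pop[0]

# a*b + a*c + a*d needs 3 multiplications; the factored form needs only 1.
e = ('+', ('+', ('*', 'a', 'b'), ('*', 'a', 'c')), ('*', 'a', 'd'))
best = evolve(e)
print(best, cost(best))  # -> ('*', 'a', ('+', ('+', 'b', 'c'), 'd')) 1
```

Because every rewrite is correctness-preserving by construction, each individual in the population is a valid implementation, which mirrors the chromosomal-representation argument made in the abstract.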
Renate Beckmann and Jürgen Herrmann.Memory Synthesis for General Purpose Computers by use of Constraint Logic Programming.Research Report #684, University of Dortmund, Dept. of CS XII July 1998[BibTeX][PDF][Abstract]@techreport { beckmann:1998:report, author = {Beckmann, Renate and Herrmann, J\"urgen}, title = {Memory Synthesis for General Purpose Computers by use of Constraint Logic Programming}, institution = {University of Dortmund, Dept. of CS XII}, year = {1998}, type = {Research Report}, number = {684}, month = {jul}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1998-TechReport-684.pdf}, confidential = {n}, abstract = {In modern computer systems the performance is dominated by the memory performance. Currently, there is neither a systematic design methodology nor a tool for designing memory systems for general purpose computers. We present a first approach to CAD support for this crucial subtask of system level design. Dependencies between influencing factors and design decisions are explicitly represented by constraints and constraint logic programming is used to make the design decisions. The memory design is optimized with respect to several objectives by iterating the (re)design cycle. Event-driven simulation is used for evaluation of the intermediate results. The system is organized as an interactive design assistant. Zusammenfassung: Die Leistung allgemeiner Rechner wird heute zunehmend durch deren Speicher bestimmt. Der Entwurf solcher Speicher wird bis jetzt weder durch eine strukturierte Vorgehensweise noch durch Werkzeuge unterst\"utzt. In diesem Papier wird eine erste Werkzeugunterst\"utzung f\"ur den Speicherentwurf auf System-Ebene vorgestellt. Abh\"angigkeiten zwischen den Einflu{\ss}faktoren und den Entwurfsentscheidungen werden dabei als Constraints dargestellt, um mit Hilfe der Constraint-Logikprogrammierung Entwurfsentscheidungen zu treffen. Der Speicherentwurf wird entsprechend verschiedener Zielkriterien iterativ durch Redesign optimiert. Ein Entwurf wird mittels Ereignis-gesteuerter Simulation bewertet. Das Werkzeug ist als interaktiver Design-Assistent konzipiert.}, }In modern computer systems the performance is dominated by the memory performance. Currently, there is neither a systematic design methodology nor a tool for designing memory systems for general purpose computers. We present a first approach to CAD support for this crucial subtask of system level design. Dependencies between influencing factors and design decisions are explicitly represented by constraints and constraint logic programming is used to make the design decisions. The memory design is optimized with respect to several objectives by iterating the (re)design cycle. Event-driven simulation is used for evaluation of the intermediate results. The system is organized as an interactive design assistant. Summary: The performance of general-purpose computers is nowadays increasingly determined by their memories. Until now, the design of such memories has been supported neither by a structured methodology nor by tools. This paper presents a first tool support for memory design at the system level. Dependencies between the influencing factors and the design decisions are represented as constraints, so that design decisions can be made with the help of constraint logic programming. The memory design is optimized iteratively through redesign according to several objectives.
A design is evaluated by means of event-driven simulation. The tool is conceived as an interactive design assistant. Peter Marwedel and Rainer Dömer.Built-in Chaining: Introducing Complex Components into Architectural Synthesis.Technical Report #611, TU Dortmund, Faculty of Computer Science 12 April 1996, DOI: 10.13140/RG.2.1.1291.1760 [BibTeX][PDF]@techreport { marwedel:1996:611, author = {Marwedel, Peter and D\"omer, Rainer}, title = {Built-in Chaining: Introducing Complex Components into Architectural Synthesis}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {1996}, number = {611}, month = {April}, note = {DOI: 10.13140/RG.2.1.1291.1760 }, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-TechReport-611.pdf}, confidential = {n}, } Peter Marwedel and Rainer Dömer.Introducing Complex Components into Architectural Synthesis.Internal report #611, University of Dortmund, Dept. of CS XII 1996[BibTeX][PDF][Abstract]@techreport { marwedel:1996:tech, author = {Marwedel, Peter and D\"omer, Rainer}, title = {Introducing Complex Components into Architectural Synthesis}, institution = {University of Dortmund, Dept. of CS XII}, year = {1996}, type = {Internal report}, number = {611}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-TechReport-611.pdf}, confidential = {n}, abstract = {In this paper, we extend the set of library components which are usually considered in architectural synthesis by components with built-in chaining. For such components, the result of some internally computed arithmetic function is made available as an argument to some other function through a local connection. These components can be used to implement chaining in a data-path in a single component. Components with built-in chaining are combinational circuits. They correspond to ``complex gates'' in logic synthesis. Compared to implementations with several components, components with built-in chaining usually provide a denser layout, reduced power consumption, and a shorter delay time. Multiplier/accumulators are the most prominent example of such components. Such components require new approaches for library mapping in architectural synthesis. In this paper, we describe an ILP-based approach taken in our OSCAR synthesis system.}, }In this paper, we extend the set of library components which are usually considered in architectural synthesis by components with built-in chaining. For such components, the result of some internally computed arithmetic function is made available as an argument to some other function through a local connection. These components can be used to implement chaining in a data-path in a single component. Components with built-in chaining are combinational circuits. They correspond to "complex gates" in logic synthesis. Compared to implementations with several components, components with built-in chaining usually provide a denser layout, reduced power consumption, and a shorter delay time. Multiplier/accumulators are the most prominent example of such components. Such components require new approaches for library mapping in architectural synthesis. In this paper, we describe an ILP-based approach taken in our OSCAR synthesis system.
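The library-mapping problem behind the two Marwedel and Dömer reports above, choosing between separate units and a component with built-in chaining such as a multiplier/accumulator, can be phrased as a tiny 0-1 integer program. The sketch below is a hedged illustration rather than the OSCAR model: it enumerates the binary selection variables by brute force instead of calling an ILP solver, and all component names, areas and delays are invented.

```python
from itertools import product

# Hypothetical component library: name -> (area, delay_ns, operations covered).
LIBRARY = {
    "mul": (120, 8.0, {"mul"}),
    "add": ( 30, 3.0, {"add"}),
    "mac": (135, 9.5, {"mul", "add"}),  # built-in chaining: mul feeds add internally
}

def select_components(required_ops, max_delay):
    """Brute-force stand-in for a 0-1 IP: minimize total area such that every
    required operation is covered and no selected component exceeds max_delay."""
    names = list(LIBRARY)
    best = None
    for x in product((0, 1), repeat=len(names)):
        chosen = [n for n, xi in zip(names, x) if xi]
        covered = set().union(*[LIBRARY[n][2] for n in chosen])
        if not required_ops <= covered:
            continue  # some operation has no component bound to it
        if any(LIBRARY[n][1] > max_delay for n in chosen):
            continue  # component too slow for the clock period
        area = sum(LIBRARY[n][0] for n in chosen)
        if best is None or area < best[0]:
            best = (area, chosen)
    return best

# A chained mul+add within one clock step: the MAC (area 135) beats mul+add (150).
print(select_components({"mul", "add"}, max_delay=10.0))  # -> (135, ['mac'])
```

An actual architectural-synthesis model would, as the abstracts indicate, couple such selection variables with scheduling and binding constraints in one integer program.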
Ralf Niemann.Hardware/Software Partitioning using Integer Programming.Technical Report #586, TU Dortmund, Faculty of Computer Science 12 1995[BibTeX][PDF][Abstract]@techreport { niemann:1995:report, author = {Niemann, Ralf}, title = {Hardware/Software Partitioning using Integer Programming}, institution = {TU Dortmund, Faculty of Computer Science 12}, year = {1995}, type = {Technical Report}, number = {586}, keywords = {hwsw}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-TechReport-586.pdf}, confidential = {n}, abstract = {One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated with respect to the chosen objective function. The partitioning approach works fully automatically and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating the cost metrics is compensated by an improved quality of the estimations compared to the results of estimators. Therefore, fewer iteration steps of partitioning are needed. The paper will show that using integer programming to solve the hardware/software partitioning problem is feasible and leads to promising results.}, }One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated with respect to the chosen objective function. The partitioning approach works fully automatically and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating the cost metrics is compensated by an improved quality of the estimations compared to the results of estimators. Therefore, fewer iteration steps of partitioning are needed. The paper will show that using integer programming to solve the hardware/software partitioning problem is feasible and leads to promising results. Steven Bashford.Code Generation Techniques for Irregular Architectures.Technical Report #596, University of Dortmund, Dept. of CS XII 1995[BibTeX][PDF][Abstract]@techreport { bashford:1995:report, author = {Bashford, Steven}, title = {Code Generation Techniques for Irregular Architectures}, institution = {University of Dortmund, Dept. of CS XII}, year = {1995}, type = {Technical Report}, number = {596}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-TechReport-596.pdf}, confidential = {n}, abstract = {The fast development of many different ASIPs creates a demand for the rapid availability of dedicated compilers. Fast retargeting is a major aspect, while fast compilation times are of minor importance. There are also new demands on the quality of the generated code. Irregular properties together with the fine-grain parallelism given by a target architecture have to be effectively supported by the compiler. This report is focused on the traditional tasks of code generation - code selection, register allocation, and instruction scheduling.
The major subject is to expose the tendencies in research on code generation techniques in recent years, and to survey their features with regard to support for irregular architectures, fine-grain parallelism, retargetability, and phase coupling. The report outlines the preferable techniques involved in code generators. Features of irregular architectures that are sufficiently supported by these techniques are examined. The insufficiencies with regard to irregular architectures are described, and approaches to overcome them are outlined. The essential problems arising are due to mutual dependencies among the tasks of code generation. Thus, phase ordering problems and phase coupling approaches are an important topic of the report. Retargeting is discussed with regard to the retargetability of the described techniques, but also with regard to the quality of the generated code. Relations between structural and behavioral models are exposed, addressing the issue of supporting both the design process of the target architecture and the effective retargeting of all tasks of code generation.}, }The fast development of many different ASIPs creates a demand for the rapid availability of dedicated compilers. Fast retargeting is a major aspect, while fast compilation times are of minor importance. There are also new demands on the quality of the generated code. Irregular properties together with the fine-grain parallelism given by a target architecture have to be effectively supported by the compiler. This report is focused on the traditional tasks of code generation - code selection, register allocation, and instruction scheduling. The major subject is to expose the tendencies in research on code generation techniques in recent years, and to survey their features with regard to support for irregular architectures, fine-grain parallelism, retargetability, and phase coupling. The report outlines the preferable techniques involved in code generators. Features of irregular architectures that are sufficiently supported by these techniques are examined. The insufficiencies with regard to irregular architectures are described, and approaches to overcome them are outlined. The essential problems arising are due to mutual dependencies among the tasks of code generation. Thus, phase ordering problems and phase coupling approaches are an important topic of the report. Retargeting is discussed with regard to the retargetability of the described techniques, but also with regard to the quality of the generated code. Relations between structural and behavioral models are exposed, addressing the issue of supporting both the design process of the target architecture and the effective retargeting of all tasks of code generation.
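The integer-programming formulation in the Niemann abstract above decides, for each node of the system specification, whether to implement it in hardware or in software. As a hedged, deliberately tiny stand-in for that model, the sketch below solves the same 0-1 decision by exhaustive enumeration, minimizing hardware area under a single global deadline; the task names and numbers are invented, and a realistic instance would use an IP solver plus interfacing and communication costs, which are ignored here.

```python
from itertools import product

# Hypothetical tasks: name -> (sw_time, hw_time, hw_area).
TASKS = {
    "filter":  (40.0, 5.0, 200),
    "fft":     (60.0, 8.0, 350),
    "control": (10.0, 9.0, 150),
    "io":      ( 5.0, 4.0, 100),
}

def partition(deadline):
    """Minimize hardware area s.t. the summed execution time meets the deadline.
    x[i] = 1 means 'implement task i in hardware' (0-1 IP solved by enumeration)."""
    names = list(TASKS)
    best = None
    for x in product((0, 1), repeat=len(names)):
        time = sum(TASKS[n][1] if xi else TASKS[n][0] for n, xi in zip(names, x))
        area = sum(TASKS[n][2] for n, xi in zip(names, x) if xi)
        if time <= deadline and (best is None or area < best[0]):
            best = (area, [n for n, xi in zip(names, x) if xi])
    return best

print(partition(deadline=60.0))  # -> (550, ['filter', 'fft'])
```

The sequential-time model above is the crudest possible cost estimation; the report's point is precisely that better estimates, obtained from compilation and synthesis tools, feed the same kind of integer program.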
Peter Marwedel, Steven Bashford, Rainer Dömer, Birger Landwehr and Ingolf Markhof.A Technique for Avoiding Isomorphic Netlists in Architectural Synthesis.Technical Report #95-28, University of California at Irvine August 1995[BibTeX][PDF][Abstract]@techreport { marwedel:1995:irvine, author = {Marwedel, Peter and Bashford, Steven and D\"omer, Rainer and Landwehr, Birger and Markhof, Ingolf}, title = {A Technique for Avoiding Isomorphic Netlists in Architectural Synthesis}, institution = {University of California at Irvine}, year = {1995}, type = {Technical Report}, number = {95-28}, month = {August}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-irvine_tr_95_28.pdf}, confidential = {n}, abstract = {Register-Transfer (RT-) level netlists are said to be isomorphic if they can be made identical by relabelling RT-components. RT-netlists can be generated by architectural synthesis. In order to consider just the essential design decisions, architectural synthesis should consider only a single representative of each set of isomorphic netlists. Nevertheless, many current synthesis algorithms do not take advantage of this potential reduction in search space. This is especially true for approaches which focus on optimizing the wiring between resource instances. In this paper, we use netlist isomorphism for the very first time in architectural synthesis. Furthermore, we describe how an integer-programming (IP-) based synthesis technique can be extended to take advantage of netlist isomorphism. As a result, the running time required for synthesis is reduced.}, }Register-Transfer (RT-) level netlists are said to be isomorphic if they can be made identical by relabelling RT-components. RT-netlists can be generated by architectural synthesis. In order to consider just the essential design decisions, architectural synthesis should consider only a single representative of each set of isomorphic netlists. Nevertheless, many current synthesis algorithms do not take advantage of this potential reduction in search space. This is especially true for approaches which focus on optimizing the wiring between resource instances. In this paper, we use netlist isomorphism for the very first time in architectural synthesis. Furthermore, we describe how an integer-programming (IP-) based synthesis technique can be extended to take advantage of netlist isomorphism. As a result, the running time required for synthesis is reduced. M. Balakrishnan and Anshul Kumar.Optimal Clock Period for Synthesized Data Paths.Technical Report #547, University of Dortmund, Dept. of CS XII April 1995[BibTeX][PDF][Abstract]@techreport { balakrishnan:1995:report, author = {Balakrishnan, M. and Kumar, Anshul}, title = {Optimal Clock Period for Synthesized Data Paths}, institution = {University of Dortmund, Dept. of CS XII}, year = {1995}, type = {Technical Report}, number = {547}, month = {apr}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-TechReport-574.pdf}, confidential = {n}, abstract = {The choice of a clock period in designs with multicycle operations has a major influence on operator allocation as well as execution time. For technologies with significant interconnection delays, optimal clock period selection before/during high-level synthesis is not practical. In our approach, we start with a synthesized RTL data path structure, perform place and route, and back-annotate the interconnection delays.
First, a bound flow graph is constructed by reflecting the allocation and binding information on the data flow graph. All potentially critical paths in this bound flow graph are identified. Execution time is computed by evaluating these path lengths, thus avoiding rescheduling. Based on the execution times, a set of potentially optimal clock periods is chosen. An optimal clock period is one which results in the minimum execution time while meeting a controller cost constraint. Finally, the controller costs at these clock periods along with the execution times decide the optimal clock period. Extensive experimental results on data paths synthesized from high-level synthesis benchmarks establish both the utility as well as the efficiency of our approach. These results clearly show that choosing a clock period to minimize the "dead time" of the multicycle operators can improve the circuit performance by up to 10\% or even more. Apart from presenting a methodology to decide the clock period, the report introduces a novel way of representing and interpreting binding information (operation-operator and value-register) which may have other interesting applications.}, }The choice of a clock period in designs with multicycle operations has a major influence on operator allocation as well as execution time. For technologies with significant interconnection delays, optimal clock period selection before/during high-level synthesis is not practical. In our approach, we start with a synthesized RTL data path structure, perform place and route, and back-annotate the interconnection delays. First, a bound flow graph is constructed by reflecting the allocation and binding information on the data flow graph. All potentially critical paths in this bound flow graph are identified. Execution time is computed by evaluating these path lengths, thus avoiding rescheduling. Based on the execution times, a set of potentially optimal clock periods is chosen. An optimal clock period is one which results in the minimum execution time while meeting a controller cost constraint. Finally, the controller costs at these clock periods along with the execution times decide the optimal clock period. Extensive experimental results on data paths synthesized from high-level synthesis benchmarks establish both the utility as well as the efficiency of our approach. These results clearly show that choosing a clock period to minimize the "dead time" of the multicycle operators can improve the circuit performance by up to 10% or even more. Apart from presenting a methodology to decide the clock period, the report introduces a novel way of representing and interpreting binding information (operation-operator and value-register) which may have other interesting applications. Renate Beckmann.Entwurfsentscheidungen und Einflußfaktoren der Speichersynthese für allgemeine Prozessor-Systeme.Research Report #546, University of Dortmund, Dept. of CS XII September 1994[BibTeX][PDF][Abstract]@techreport { beckmann:1994:report, author = {Beckmann, Renate}, title = {Entwurfsentscheidungen und Einflu{\ss}faktoren der Speichersynthese f\"ur allgemeine Prozessor-Systeme}, institution = {University of Dortmund, Dept. of CS XII}, year = {1994}, type = {Research Report}, number = {546}, month = {sep}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-TechReport-546.pdf}, confidential = {n}, abstract = {This report deals with memory synthesis for general purpose processors.
The performance of such memory systems highly depends on the access behavior of the application programs, which, in turn, depends on the application domain and the host processor system. Processor memory synthesis has to design, for a given application domain and a given host processor system, a memory hierarchy which has optimal performance relative to access time and costs. Due to the increasing complexity of memory systems, tool support is necessary. This report describes the memory design decisions as well as the factors influencing an optimal memory design, and the relations between them. Zusammenfassung: Dieser Bericht befa{\ss}t sich mit der Synthese von Speichern f\"ur allgemeine Prozessor-Systeme. Die Qualit\"at solcher Speicher h\"angt sehr von den Zugriffseigenschaften der Anwendungsprogramme ab, die wiederum von dem Anwendungsbereich und dem entsprechenden Prozessor-System abh\"angen. Die Aufgabe dieser Speichersynthese ist es, f\"ur gegebene Anwendungsbereiche auf einem bestimmten Rechner-System (mit einem oder mehreren Prozessoren) eine optimale Speicherhierarchie (Cache, Hauptspeicher, usw.) hinsichtlich Zugriffszeit und Kosten zu entwerfen. Da heutige Speichersysteme zunehmend komplexer und ausgefeilter werden, wird f\"ur deren Entwurf Werkzeugunterst\"utzung notwendig. Hier werden die einzelnen Entwurfsentscheidungen sowie die verschiedenen Faktoren, die einen optimalen Speicherentwurf beeinflussen, und entsprechende Zusammenh\"ange n\"aher beschrieben.}, }This report deals with memory synthesis for general purpose processors. The performance of such memory systems highly depends on the access behavior of the application programs, which, in turn, depends on the application domain and the host processor system. Processor memory synthesis has to design, for a given application domain and a given host processor system, a memory hierarchy which has optimal performance relative to access time and costs. Due to the increasing complexity of memory systems, tool support is necessary. This report describes the memory design decisions as well as the factors influencing an optimal memory design, and the relations between them. Summary: This report deals with the synthesis of memories for general-purpose processor systems. The quality of such memories depends strongly on the access behavior of the application programs, which in turn depends on the application domain and the corresponding processor system. The task of this memory synthesis is to design, for given application domains on a particular computer system (with one or more processors), a memory hierarchy (cache, main memory, etc.) that is optimal with respect to access time and cost. Since today's memory systems are becoming increasingly complex and sophisticated, tool support becomes necessary for their design. Here, the individual design decisions as well as the various factors that influence an optimal memory design, and the corresponding interrelationships, are described in more detail. Birger Landwehr and Rainer Doemer.OSCAR: Optimum Simultaneous Scheduling, Allocation and Resource Binding Based on Integer Programming.Internal report #484, University of Dortmund, Dept. of CS XII 1994[BibTeX][PDF][Abstract]@techreport { landwehr:1994:report, author = {Landwehr, Birger and Doemer, Rainer}, title = {OSCAR: Optimum Simultaneous Scheduling, Allocation and Resource Binding Based on Integer Programming}, institution = {University of Dortmund, Dept.
of CS XII}, year = {1994}, type = {Internal report}, number = {484}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-TechReport-484.pdf}, confidential = {n}, abstract = {In this report we describe an IP-model based high-level synthesis system. In contrast to other approaches, the presented IP-model allows solving all three subtasks of high-level synthesis (scheduling, allocation and binding) simultaneously. As a result, designs which are optimal with respect to the cost function are generated. The model is able to exploit large component libraries with multi-functional units and complex components such as multiplier-accumulators. Additionally, the model is capable of handling mixed speeds and chaining in its general form. Applying algebraic transformations helps to exploit the underlying component libraries more efficiently than other HLS systems.}, }In this report we describe an IP-model based high-level synthesis system. In contrast to other approaches, the presented IP-model allows solving all three subtasks of high-level synthesis (scheduling, allocation and binding) simultaneously. As a result, designs which are optimal with respect to the cost function are generated. The model is able to exploit large component libraries with multi-functional units and complex components such as multiplier-accumulators. Additionally, the model is capable of handling mixed speeds and chaining in its general form. Applying algebraic transformations helps to exploit the underlying component libraries more efficiently than other HLS systems. Steven Bashford, Ulrich Bieker, Berthold Harking, Rainer Leupers, Peter Marwedel, Andreas Neumann and Dietmar Voggenauer.The MIMOLA Language Version 4.1.Technical Report, Fakultät für Informatik, TU Dortmund September 1994[BibTeX]@techreport { marwedel:94:mimolaref, author = {Bashford, Steven and Bieker, Ulrich and Harking, Berthold and Leupers, Rainer and Marwedel, Peter and Neumann, Andreas and Voggenauer, Dietmar}, title = {The MIMOLA Language Version 4.1}, institution = {Fakult\"at f\"ur Informatik, TU Dortmund}, year = {1994}, month = {September}, confidential = {n}, } Rainer Leupers and Wolfgang Schenk.Retargetable Assembly Code Generation by Bootstrapping.Technical Report #488, University of Dortmund, Dept. of CS XII 1993[BibTeX][PDF][Abstract]@techreport { leupers:1993:report, author = {Leupers, Rainer and Schenk, Wolfgang}, title = {Retargetable Assembly Code Generation by Bootstrapping}, institution = {University of Dortmund, Dept. of CS XII}, year = {1993}, type = {Technical Report}, number = {488}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-TechReport-488.pdf}, confidential = {n}, abstract = {In a hardware/software codesign environment, compilers are needed that map software components of a partitioned system behavior description onto a programmable processor. Since the processor structure is not static, but can repeatedly change during the design process, the compiler should be retargetable to avoid manual compiler adaptation for any alternative architecture. A restriction of existing retargetable compilers is that they only generate microcode for the target architecture instead of machine-level code. In this paper we introduce a bootstrapping technique allowing us to translate high-level language (HLL) programs into real machine-level code using a retargetable microcode compiler.
The retargetability is preserved, permitting different architectural alternatives in a codesign framework to be compared within relatively little time. As an application of the new code generation technique, we consider hardware/software codesign of heterogeneous information processing systems.}, }In a hardware/software codesign environment, compilers are needed that map software components of a partitioned system behavior description onto a programmable processor. Since the processor structure is not static, but can repeatedly change during the design process, the compiler should be retargetable to avoid manual compiler adaptation for any alternative architecture. A restriction of existing retargetable compilers is that they only generate microcode for the target architecture instead of machine-level code. In this paper we introduce a bootstrapping technique allowing us to translate high-level language (HLL) programs into real machine-level code using a retargetable microcode compiler. The retargetability is preserved, permitting different architectural alternatives in a codesign framework to be compared within relatively little time. As an application of the new code generation technique, we consider hardware/software codesign of heterogeneous information processing systems. Peter Marwedel.MSSV: Tree-Based Mapping of Algorithms to Predefined Structures (Extended Version).Technical Report #431, University of Dortmund, Dept. of CS XII January 1993[BibTeX][PDF][Abstract]@techreport { marwedel:1993:report, author = {Marwedel, Peter}, title = {MSSV: Tree-Based Mapping of Algorithms to Predefined Structures (Extended Version)}, institution = {University of Dortmund, Dept. of CS XII}, year = {1993}, type = {Technical Report}, number = {431}, month = {jan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-TechReport-431.pdf}, confidential = {n}, abstract = {Due to the need for fast design cycles and low production cost, programmable circuits like FPGAs and DSP processors (henceforth called target structures) are becoming increasingly popular. Design planning, detailed design as well as updating such designs require mapping existing algorithms onto these circuits. Instead of writing target-specific mappers, we propose using retargetable mappers. The technique reported in this paper is based on pattern matching. Binary code is generated as a result of this matching process. This paper describes the essential techniques of our mapper MSSV and identifies areas for improvements. As a result, it shows that efficient handling of alternative mappings is crucial for an acceptable performance. This report is also intended as a reference for new developments.}, }Due to the need for fast design cycles and low production cost, programmable circuits like FPGAs and DSP processors (henceforth called target structures) are becoming increasingly popular. Design planning, detailed design as well as updating such designs require mapping existing algorithms onto these circuits. Instead of writing target-specific mappers, we propose using retargetable mappers. The technique reported in this paper is based on pattern matching. Binary code is generated as a result of this matching process. This paper describes the essential techniques of our mapper MSSV and identifies areas for improvements. As a result, it shows that efficient handling of alternative mappings is crucial for an acceptable performance. This report is also intended as a reference for new developments. R. Beckmann, D.
Pusch, R. Johnke, Peter Marwedel and W. Schenk.The MIMOLA Language Reference Manual.Technical Report, Fakultät für Informatik, TU Dortmund 1993[BibTeX]@techreport { marwedel:93:mimolaref, author = {Beckmann, R. and Pusch, D. and Johnke, R. and Marwedel, Peter and Schenk, W.}, title = {The MIMOLA Language Reference Manual}, institution = {Fakult\"at f\"ur Informatik, TU Dortmund}, year = {1993}, confidential = {n}, } Ulrich Bieker.On the Semantics of the TREEMOLA Language Version 4.0.Research Report #435, University of Dortmund, Dept. of CS XII July 1992[BibTeX][PDF][Abstract]@techreport { bieker:1992:report, author = {Bieker, Ulrich}, title = {On the Semantics of the TREEMOLA Language Version 4.0}, institution = {University of Dortmund, Dept. of CS XII}, year = {1992}, type = {Research Report}, number = {435}, month = {jul}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1992-TechReport-435.pdf}, confidential = {n}, abstract = {Usually the semantics of a hardware description language is either implicitly given by a simulator or is in the mind of the designer of the language. Therefore, a good documentation or a formal semantic definition is of great importance for every user of the language. This report is intended to fill this gap in the context of MIMOLA. MIMOLA is a computer hardware description language (CHDL) which has been influenced by other hardware description languages like VHDL and DACAPO. TREEMOLA is the language that is used to exchange design data between different CAD-tools in the MIMOLA hardware design system MSS. Using first order predicate calculus a formal semantic definition is obtained for a subset of the intermediate language TREEMOLA.}, } R. Johnk and Peter Marwedel.The MIMOLA Language Reference Manual.Technical Report, Fakultät für Informatik, TU Dortmund, Universität Kiel, 1989[BibTeX]@techreport { marwedel:83:mimolaref, author = {Johnk, R. and Marwedel, Peter}, title = {The MIMOLA Language Reference Manual}, institution = {Fakult\"at f\"ur Informatik, TU Dortmund}, year = {1989}, address = {Universit\"at Kiel}, confidential = {n}, } Reinhard Jöhnk and Peter Marwedel.MIMOLA Reference Manual - Version 3.45.Technical Report, Institut für Informatik und Praktische Mathematik, Universität Kiel March 1989[BibTeX]@techreport { marwedel:89:mimolaref, author = {J\"ohnk, Reinhard and Marwedel, Peter}, title = {MIMOLA Reference Manual - Version 3.45}, institution = {Institut f\"ur Informatik und Praktische Mathematik, Universit\"at Kiel}, year = {1989}, month = {March}, confidential = {n}, }
M. Balakrishnan.RT-Level Synthesis based on Integrated Scheduling and Binding.Technical Report #8813, Institut für Informatik, Universität Kiel 1988[BibTeX][PDF][Abstract]@techreport { balakrishnan:1988, author = {Balakrishnan, M.}, title = {RT-Level Synthesis based on Integrated Scheduling and Binding}, institution = {Institut f\"ur Informatik, Universit\"at Kiel}, year = {1988}, type = {Technical Report}, number = {8813}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1988-Balakrishnan.pdf}, confidential = {n}, abstract = {Synthesis of digital systems involves a number of tasks ranging from scheduling to generating interconnections. The interrelationship between these tasks implies that good designs can only be generated by considering the overall effect of a design decision. The approach presented in this report provides a framework for integrating scheduling decisions with binding decisions. The methodology supports allocation of a wider mix of operator modules and covers the design space more effectively. The process itself can be described as incremental synthesis and is thus well-suited for applications involving partially presynthesised structures. Specifically, the report deals with the tasks of scheduling, binding operations to operators and intermediate values to storage units. Further, the generated structure is optimized by examining the feasibility of merging storage units to form memories. All the optimization tasks are modelled as constrained 0-1 integer programming problems with the objective of reducing interconnections.}, }
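The closing sentence of this abstract, binding modelled as a constrained 0-1 integer program, can be illustrated with a generic formulation; the notation below is ours, not the report's exact model. Here $x_{o,f}=1$ means operation $o$ is bound to functional unit $f$, $w_{o,f}$ is the interconnect cost of that binding, and $S_t$ is the set of operations scheduled in control step $t$:

```latex
\begin{align*}
\min\; & \sum_{o \in O}\sum_{f \in F} w_{o,f}\,x_{o,f}
        && \text{(total interconnect cost)}\\
\text{s.t.}\; & \sum_{f \in F} x_{o,f} = 1 && \forall o \in O
        \quad \text{(each operation is bound exactly once)}\\
& \sum_{o \in S_t} x_{o,f} \le 1 && \forall f \in F,\ \forall t
        \quad \text{(a unit serves at most one operation per step)}\\
& x_{o,f} \in \{0,1\}
\end{align*}
```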
K. Kelle, G. Krüger, P. Marwedel, L. Nowak, L. Terasa and F. Wosnitza.Werkzeuge des MIMOLA-Hardware-Entwurfssystems.Technical Report #8707, Institut für Informatik und Praktische Mathematik, Universität Kiel 1987[BibTeX][PDF][Abstract]@techreport { kelle:1987:report, author = {Kelle, K. and Kr\"uger, G. and Marwedel, P. and Nowak, L. and Terasa, L. and Wosnitza, F.}, title = {Werkzeuge des MIMOLA-Hardware-Entwurfssystems}, institution = {Institut f\"ur Informatik und Praktische Mathematik, Universit\"at Kiel}, year = {1987}, type = {Technical Report}, number = {8707}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1987-Mimola-Werkzeuge.pdf}, confidential = {n}, abstract = {This report describes the tools of a CAD system supporting the design of digital hardware. The CAD system is based upon the MIMOLA hardware description language. PASCAL-like algorithms (or "behaviour") as well as hardware structures (netlists) can be described in MIMOLA. Main emphasis is on the register transfer level of abstraction, but application levels and the gate level are also covered. The system currently contains six main tools, a common tool environment and some auxiliary programs. The six main tools perform data/control path synthesis, automatic generation of self-test programs, retargetable microcode generation, schematics generation, simulation and testability analysis.}, } Peter Marwedel.Ein Software-System zur Synthese von Rechnerstrukturen und zur Erzeugung von Microcode.habilitation thesis, Universität Kiel 1985[BibTeX][PDF][Abstract]@techreport { marwedel:1985:habil, author = {Marwedel, Peter}, title = {Ein Software-System zur Synthese von Rechnerstrukturen und zur Erzeugung von Microcode}, institution = {Universit\"at Kiel}, year = {1985}, type = {habilitation thesis}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1985-habilitation-Marwedel.pdf}, confidential = {n}, abstract = {Aufgrund der Fortschritte im Bereich der Halbleitertechnologie werden immer komplexere integrierte Schaltkreise hergestellt. Als Folge davon mu{\ss} der Entwurf dieser Schaltkreise von immer h\"oheren Spezifikationsebenen ausgehen. W\"ahrend die Spezifikation der ersten gefertigten Halbleiter, n\"amlich der Einzeltransistoren, nur elektrische Eigenschaften beinhaltete, so umfa{\ss}t diese heute beim Entwurf von Mikroprozessoren ganze Maschinenbefehlss\"atze...}, }Owing to advances in semiconductor technology, ever more complex integrated circuits are being manufactured. As a consequence, the design of these circuits must start from ever higher levels of specification. While the specification of the first manufactured semiconductors, namely individual transistors, comprised only electrical properties, it nowadays encompasses entire machine instruction sets when microprocessors are designed...
Peter Marwedel.Hardware Allocation for Horizontal Microinstructions in the MIMOLA Software System.Technical Report #5/80, Institut für Informatik und Praktische Mathematik, Universität Kiel, 1979[BibTeX]@techreport { marw:79:hori, author = {Marwedel, Peter}, title = {Hardware Allocation for Horizontal Microinstructions in the MIMOLA Software System}, institution = {Institut f\"ur Informatik und Praktische Mathematik}, year = {1979}, number = {5/80}, address = {Universit\"at Kiel}, confidential = {n}, } P. Marwedel and G. Zimmermann.MIMOLA-Report Revision 1 and MIMOLA Software System User Manual.Technical Report #2, Institut für Informatik und Praktische Mathematik, Universität Kiel 1979[BibTeX][PDF][Abstract]@techreport { marwedel:1979:xyz, author = {Marwedel, P. and Zimmermann, G.}, title = {MIMOLA-Report Revision 1 and MIMOLA Software System User Manual}, institution = {Institut f\"ur Informatik und Praktische Mathematik, Universit\"at Kiel}, year = {1979}, type = {Technical Report}, number = {2}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1979-Mimola-Man.pdf}, confidential = {n}, abstract = {This Revision 1 replaces the original report (1). The syntax of MIMOLA has been slightly changed and extended. The hardware declaration and assignment parts have been redesigned. A MACRO feature has been added. A description of the hardware database and the control language of the MSS (MIMOLA Software System) has been added (2,3,4). MIMOLA is a computer hardware description language (CHDL) and a programming language.}, } Peter Marwedel.Ein praktisches Verfahren zum Entwurf synchroner Schaltwerke.Technical Report #1, Institut für Informatik und Praktische Mathematik, Universität Kiel 1977[BibTeX][PDF][Abstract]@techreport { marwedel:1977:report, author = {Marwedel, Peter}, title = {Ein praktisches Verfahren zum Entwurf synchroner Schaltwerke}, institution = {Institut f\"ur Informatik und Praktische Mathematik, Universit\"at Kiel}, year = {1977}, type = {Technical Report}, number = {1}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1977-Report-Kiel.pdf}, confidential = {n}, abstract = {Wenn man sich mit dem praktischen Entwurf synchroner Schaltwerke besch\"aftigt, so hat man bislang im wesentlichen zwei M\"oglichkeiten: 1. Von einigen Flipflops und Gattern ausgehend konstruiert man sich ein Schaltwerk...}, }Anyone concerned with the practical design of synchronous sequential circuits has so far had essentially two options: 1. Starting from a few flip-flops and gates, one constructs a sequential circuit by hand...
Mikail Yayla, Anas Toma, Jan Eric Lenssen, Victoria Shpacovitch, Kuan-Hsun Chen, Frank Weichert and Jian-Jia Chen.Resource-Efficient Nanoparticle Classification Using Frequency Domain Analysis. In BVM Workshop, Lübeck, Germany, March 2019[BibTeX][Abstract]@inproceedings { Yayla-BVM2019, author = {Yayla, Mikail and Toma, Anas and Lenssen, Jan Eric and Shpacovitch, Victoria and Chen, Kuan-Hsun and Weichert, Frank and Chen, Jian-Jia}, title = {Resource-Efficient Nanoparticle Classification Using Frequency Domain Analysis}, booktitle = {BVM Workshop}, year = {2019}, address = {L\"ubeck, Germany}, month = {March}, keywords = {kuan}, confidential = {n}, abstract = {We present a method for resource-efficient classification of nanoparticles such as viruses in liquid or gas samples by analyzing Surface Plasmon Resonance (SPR) images using frequency domain features. The SPR images are obtained with the Plasmon Assisted Microscopy Of Nano-sized Objects (PAMONO) biosensor, which was developed as a mobile virus and particle detector. Convolutional neural network (CNN) solutions are available for the given task, but since the mobility of the sensor is an important factor, we provide a faster and less resource-demanding alternative approach for use in a small virus detection device. The execution time of our approach, which can be optimized further using low-power hardware such as a digital signal processor (DSP), is at least 2.6 times faster than the current CNN solution while sacrificing only 1 to 2.5 percentage points in accuracy.}, }
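As an illustration of the frequency-domain feature idea in this abstract, the sketch below aggregates the 2D-FFT magnitude of an image patch into radial band energies; the band count, normalization, and any downstream classifier are placeholders, not the paper's actual pipeline.

```python
import numpy as np

def radial_band_energies(patch: np.ndarray, n_bands: int = 8) -> np.ndarray:
    """Aggregate the 2D-FFT magnitude of an image patch into radial frequency bands."""
    spec = np.abs(np.fft.fftshift(np.fft.fft2(patch)))
    h, w = spec.shape
    yy, xx = np.mgrid[0:h, 0:w]
    radius = np.hypot(yy - h / 2.0, xx - w / 2.0)
    edges = np.linspace(0.0, radius.max() + 1e-9, n_bands + 1)
    feats = np.array([spec[(radius >= lo) & (radius < hi)].sum()
                      for lo, hi in zip(edges[:-1], edges[1:])])
    return feats / (feats.sum() + 1e-12)  # normalize so features are intensity-invariant

# usage: feed radial_band_energies(spr_patch) into any lightweight classifier
```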
Kuan-Hsun Chen, Niklas Ueter, Georg von der Brüggen and Jian-Jia Chen.Efficient Computation of Deadline-Miss Probability and Parametric Remedies for Potential Pitfalls. In Design, Automation and Test in Europe (DATE), Florence, Italy, 25-29th, March 2019[BibTeX][PDF]@inproceedings { khchenDATE2019, author = {Chen, Kuan-Hsun and Ueter, Niklas and Br\"uggen, Georg von der and Chen, Jian-Jia}, title = {Efficient Computation of Deadline-Miss Probability and Parametric Remedies for Potential Pitfalls}, booktitle = {Design, Automation and Test in Europe (DATE)}, year = {2019}, address = {Florence, Italy}, month = {25-29th, March}, keywords = {kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/kuan2019date.pdf}, confidential = {n}, } Helena Kotthaus, Lea Schönberger, Andreas Lang, Jian-Jia Chen and Peter Marwedel.Can Flexible Multi-Core Scheduling Help to Execute Machine Learning Algorithms Resource-Efficiently?. In 22nd International Workshop on Software and Compilers for Embedded Systems, pages 59--62 2019[BibTeX][Link]@inproceedings { kotthaus/2019b, author = {Kotthaus, Helena and Sch\"onberger, Lea and Lang, Andreas and Chen, Jian-Jia and Marwedel, Peter}, title = {Can Flexible Multi-Core Scheduling Help to Execute Machine Learning Algorithms Resource-Efficiently?}, booktitle = {22nd International Workshop on Software and Compilers for Embedded Systems}, year = {2019}, series = {SCOPES '19}, pages = {59--62}, publisher = {ACM}, url = {https://dl.acm.org/citation.cfm?id=3323986}, keywords = {Lea}, confidential = {n}, } Helena Kotthaus and Jan Vitek.Typical Mistakes in Data Science: Should you Trust my Model?. In Abstract Booklet of the International R User Conference (UseR!), Toulouse, France, July 2019[BibTeX][Link]@inproceedings { kotthaus/2019c, author = {Kotthaus, Helena and Vitek, Jan}, title = {Typical Mistakes in Data Science: Should you Trust my Model?}, booktitle = {Abstract Booklet of the International R User Conference (UseR!)}, year = {2019}, address = {Toulouse, France}, month = {July}, url = {http://www.user2019.fr/posters/}, confidential = {n}, } Anas Toma, Juri Wenner, Jan Eric Lenssen and Jian-Jia Chen.Adaptive Quality Optimization of Computer Vision Tasks in Resource-Constrained Devices using Edge Computing. In the 19th Annual IEEE/ACM International Symposium in Cluster, Cloud, and Grid Computing (CCGrid 2019), Larnaca, Cyprus, May 2019[BibTeX][Abstract]@inproceedings { Toma-CCGrid2019, author = {Toma, Anas and Wenner, Juri and Lenssen, Jan Eric and Chen, Jian-Jia}, title = {Adaptive Quality Optimization of Computer Vision Tasks in Resource-Constrained Devices using Edge Computing}, booktitle = {the 19th Annual IEEE/ACM International Symposium in Cluster, Cloud, and Grid Computing (CCGrid 2019)}, year = {2019}, address = {Larnaca, Cyprus}, month = {May}, confidential = {n}, abstract = {This paper presents an approach to optimize the quality of computer vision tasks in resource-constrained devices by using different execution versions of the same task. The execution versions are generated by dropping irrelevant contents of the input images or other contents that have a marginal effect on the quality of the result. Our execution model is designed to support the edge computing paradigm, where the tasks can be executed remotely on edge nodes either to improve the quality or to reduce the workload of the local device. We also propose an algorithm that selects the suitable execution versions, which includes the configuration and the location of the execution, and maximizes the total quality of the tasks based on the available resources. The proposed approach provides reliable and adaptive task execution by using several execution versions with various performance and quality trade-offs. Therefore, it is very beneficial for systems with resource and timing constraints such as portable medical devices, surveillance video cameras, wearable systems, etc. The proposed algorithm is evaluated using different computer vision benchmarks.}, }
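The version-selection step described in this abstract can be sketched under strong simplifications: each task offers a finite set of (quality, cost) versions and there is a single scalar resource budget. The greedy upgrade rule below is our illustration, not the paper's algorithm.

```python
from dataclasses import dataclass

@dataclass
class Version:
    quality: float  # result quality of this execution version
    cost: float     # resource demand (e.g., CPU time) at the chosen location

def select_versions(tasks: list[list[Version]], budget: float) -> list[Version]:
    """Start from each task's cheapest version, then spend the remaining
    budget on the best quality-per-cost upgrades (illustrative sketch)."""
    chosen = [min(vs, key=lambda v: v.cost) for vs in tasks]
    spent = sum(v.cost for v in chosen)
    while True:
        best_i, best_v, best_ratio = -1, None, 0.0
        for i, versions in enumerate(tasks):
            for v in versions:
                gain = v.quality - chosen[i].quality
                extra = v.cost - chosen[i].cost
                if gain > 0 and spent + extra <= budget:
                    ratio = gain / extra if extra > 0 else float("inf")
                    if ratio > best_ratio:
                        best_i, best_v, best_ratio = i, v, ratio
        if best_v is None:
            return chosen  # no affordable upgrade left
        spent += best_v.cost - chosen[best_i].cost
        chosen[best_i] = best_v
```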
Lea Schönberger, Georg Brüggen, Horst Schirmeier and Jian-Jia Chen.Design Optimization for Hardware-Based Message Filters in Broadcast Buses. In Design, Automation and Test in Europe (DATE), Florence, Italy, March 25-29 2019[BibTeX][Abstract]@inproceedings { schoenbergerDATE2019, author = {Sch\"onberger, Lea and Br\"uggen, Georg and Schirmeier, Horst and Chen, Jian-Jia}, title = {Design Optimization for Hardware-Based Message Filters in Broadcast Buses}, booktitle = {Design, Automation and Test in Europe (DATE)}, year = {2019}, address = {Florence, Italy}, month = {March 25-29}, keywords = {lea, georg}, confidential = {n}, abstract = {In the field of automotive engineering, broadcast buses, e.g., Controller Area Network (CAN), are frequently used to connect multiple electronic control units (ECUs). Each message transmitted on such buses can be received by each single participant, but not all messages are relevant for every ECU. For this purpose, all incoming messages must be filtered in terms of relevance by either hardware or software techniques. We address the issue of designing hardware filter configurations for clients connected to a broadcast bus in order to reduce the cost, i.e., the computation overhead, provoked by undesired but accepted messages. More precisely, we propose an SMT formulation that can be applied to i) retrieve a (minimal) perfect filter configuration, i.e., no undesired messages are received, ii) optimize the filter quality under given hardware restrictions, or iii) minimize the hardware cost for a given type of filter component and a maximum cost threshold.}, }
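For intuition, hardware acceptance filters on CAN-style buses typically match the ID bits selected by a mask against a filter code; the helper below counts the undesired-but-accepted messages that this abstract treats as cost. The names and semantics are our illustration of common filter hardware, not the paper's SMT model.

```python
def accepts(msg_id: int, code: int, mask: int) -> bool:
    """CAN-style acceptance filter: the ID bits selected by `mask`
    must match the filter `code` (illustrative semantics)."""
    return (msg_id & mask) == (code & mask)

def filter_cost(filters: list[tuple[int, int]], wanted: set[int], all_ids: set[int]) -> int:
    """Number of undesired messages that still pass some (code, mask) filter:
    the overhead a perfect configuration would reduce to zero."""
    passed = {m for m in all_ids if any(accepts(m, c, k) for c, k in filters)}
    assert wanted <= passed, "a valid configuration must not drop desired messages"
    return len(passed - wanted)
```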
Nils Hölscher, Kuan-Hsun Chen, Georg von der Brüggen and Jian-Jia Chen.Examining and Supporting Multi-Tasking in EV3OSEK. In 14th annual workshop on Operating Systems Platforms for Embedded Real-Time applications (OSPERT 2018), Barcelona, Spain, July 2018[BibTeX][PDF][Abstract]@inproceedings { Nils-OSPERT, author = {H\"olscher, Nils and Chen, Kuan-Hsun and Br\"uggen, Georg von der and Chen, Jian-Jia}, title = {Examining and Supporting Multi-Tasking in EV3OSEK}, booktitle = {14th annual workshop on Operating Systems Platforms for Embedded Real-Time applications (OSPERT 2018), Barcelona, Spain}, year = {2018}, address = {Barcelona, Spain}, month = {July}, keywords = {kuan, Georg}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2018-nils.pdf}, confidential = {n}, abstract = {Lego Mindstorms Robots are a popular platform for graduate-level research and college education. As a port of nxtOSEK, an OSEK-standard-compatible real-time operating system, EV3OSEK inherits the advantages of nxtOSEK for experiments on EV3, the latest generation of Mindstorms robots. Unfortunately, the current version of EV3OSEK still has some serious errors. In this work we address task preemption, a common feature desired in every RTOS. We reveal the errors in the current version and propose corresponding solutions for EV3OSEK that fix the errors in the IRQ-Handler and the task dispatching properly, thus enabling real multi-tasking on EV3OSEK. Our verification shows that the current design flaws are solved. Along with this work, we suggest that researchers who performed experiments on nxtOSEK should carefully examine whether the flaws presented in this paper affect their results.}, } Zheng Dong, Cong Liu, Soroush Bateni, Kuan-Hsun Chen, Jian-Jia Chen, Georg von der Brüggen and Junjie Shi.Shared-Resource-Centric Limited Preemptive Scheduling: A Comprehensive Study of Suspension-based Partitioning Approaches. In IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS) 2018[BibTeX][Abstract]@inproceedings { dong2018rtas, author = {Dong, Zheng and Liu, Cong and Bateni, Soroush and Chen, Kuan-Hsun and Chen, Jian-Jia and Br\"uggen, Georg von der and Shi, Junjie}, title = {Shared-Resource-Centric Limited Preemptive Scheduling: A Comprehensive Study of Suspension-based Partitioning Approaches}, booktitle = {IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS)}, year = {2018}, keywords = {kuan, georg}, confidential = {n}, abstract = {This paper studies the problem of scheduling a set of hard real-time sporadic tasks that may access CPU cores and a shared resource.
Motivated by the observation that the CPU resource is often abundant compared to the shared resources in multi-core and many-core systems, we propose to resolve this problem from a counter-intuitive shared-resource-centric perspective, focusing on judiciously prioritizing and scheduling tasks’ requests in a limited preemptive manner on the shared resource while viewing the worst-case latency a task may experience on the CPU cores as suspension delays. We develop a rather comprehensive set of task partitioning algorithms that partition tasks onto the shared resource with the objective of guaranteeing schedulability while minimizing the required size of the shared resource, which plays a critical role in reducing the overall cost and complexity of building resource-constrained embedded systems in many application domains. A GPU-based prototype case study and extensive simulation-based experiments have been conducted, which validate both our shared-resource-centric scheduling philosophy and the efficiency of our suspension-based partitioning solutions in practice.}, } Jian-Jia Chen, Georg von der Brüggen and Niklas Ueter.Push Forward: Global Fixed-Priority Scheduling of Arbitrary-Deadline Sporadic Task Systems. In 30th Euromicro Conference on Real-Time Systems, {ECRTS} 2018, July 3-6, 2018, Barcelona, Spain, pages 8:1--8:24 2018[BibTeX][Link]@inproceedings { Chen2018ECRTS, author = {Chen, Jian-Jia and Br\"uggen, Georg von der and Ueter, Niklas}, title = {Push Forward: Global Fixed-Priority Scheduling of Arbitrary-Deadline Sporadic Task Systems}, booktitle = {30th Euromicro Conference on Real-Time Systems, {ECRTS} 2018, July 3-6, 2018, Barcelona, Spain}, year = {2018}, pages = {8:1--8:24}, url = {http://drops.dagstuhl.de/opus/volltexte/2018/8996/pdf/LIPIcs-ECRTS-2018-8.pdf}, keywords = {georg}, confidential = {n}, } Kuan-Hsun Chen, Georg von der Brüggen and Jian-Jia Chen.Analysis of Deadline Miss Rates for Uniprocessor Fixed-Priority Scheduling.
In The 24th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), Hakodate, Japan, August 2018, Best Student Paper Award [BibTeX][PDF][Abstract]@inproceedings { khchenRTCSA18, author = {Chen, Kuan-Hsun and Br\"uggen, Georg von der and Chen, Jian-Jia}, title = {Analysis of Deadline Miss Rates for Uniprocessor Fixed-Priority Scheduling}, booktitle = {The 24th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA)}, year = {2018}, address = {Hakodate, Japan}, month = {August}, note = { Best Student Paper Award }, keywords = {Georg, kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2018-kuanrtcsa.pdf}, confidential = {n}, abstract = {Timeliness is an important feature for many embedded systems. Although soft real-time embedded systems can tolerate and allow certain deadline misses, it is still important to quantify them to justify whether the considered systems are acceptable. In this paper, we provide a way to safely over-approximate the expected deadline miss rate for a specific sporadic real-time task under fixed-priority preemptive scheduling in uniprocessor systems. Our approach is compatible with the existing results in the literature that calculate the probability of deadline misses either based on convolution-based approaches or analytically. We demonstrate our approach by considering randomly generated task sets with an execution behavior that simulates jobs that are subjected to soft errors incurred by hardware transient faults under a given fault rate. To empirically gather the deadline miss rates, we implemented an event-based simulator with a fault-injection module and release the scripts. With extensive simulations under different fault rates, we evaluate the efficiency and the pessimism of our approach. The evaluation results show that our approach is effective to derive an upper bound of the expected deadline miss rate and efficient with respect to the required computation time.}, } Mikail Yayla, Kuan-Hsun Chen and Jian-Jia Chen.Fault Tolerance on Control Applications: Empirical Investigations of Impacts from Incorrect Calculations.
In 4th Workshop on Emerging Ideas and Trends in Engineering of Cyber-Physical Systems (EITEC) 2018[BibTeX][Link][Abstract]@inproceedings { Yayla2018EITEC, author = {Yayla, Mikail and Chen, Kuan-Hsun and Chen, Jian-Jia}, title = {Fault Tolerance on Control Applications: Empirical Investigations of Impacts from Incorrect Calculations}, booktitle = {4th Workshop on Emerging Ideas and Trends in Engineering of Cyber-Physical Systems (EITEC)}, year = {2018}, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2018-yayla-eitec.pdf}, keywords = {kuan}, confidential = {n}, abstract = {Due to aggressive technology downscaling, mobile and embedded systems are susceptible to transient faults in the underlying hardware. Transient faults may incur soft errors or even lead to system failure. A recent study has proposed to exploit the concept of the (m,k)-firm real-time task model with compensation techniques to manage redundant executions, aiming to selectively protect the control application. In this work we provide an empirical approach to find the (m,k) robustness requirements. With the delivered (m,k) robustness requirements on path tracing and balance control tasks, we conduct comprehensive case studies to evaluate the effectiveness of the compensation techniques under different fault locations and fault rates.}, }
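The (m,k)-firm requirement mentioned in this abstract states that at least m out of any k consecutive jobs must complete correctly and on time; a small sliding-window check (our sketch, not the paper's tooling) makes this precise.

```python
from collections import deque

def satisfies_mk(outcomes: list[bool], m: int, k: int) -> bool:
    """(m,k)-firm check: every window of k consecutive job outcomes
    must contain at least m successful (True) jobs."""
    window: deque[bool] = deque(maxlen=k)
    for ok in outcomes:
        window.append(ok)
        if len(window) == k and sum(window) < m:
            return False
    return True

# e.g. satisfies_mk([True, False, True, True], m=2, k=3) -> True
```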
Helena Kotthaus, Andreas Lang and Peter Marwedel.Optimizing Parallel R Programs via Dynamic Scheduling Strategies. In Abstract Booklet of the International R User Conference (UseR!), Brisbane, Australia, July 2018[BibTeX][Link]@inproceedings { kotthaus/2018a, author = {Kotthaus, Helena and Lang, Andreas and Marwedel, Peter}, title = {Optimizing Parallel R Programs via Dynamic Scheduling Strategies}, booktitle = {Abstract Booklet of the International R User Conference (UseR!)}, year = {2018}, address = {Brisbane, Australia}, month = {July}, url = {https://stat.ethz.ch/R-manual/R-devel/library/parallel/doc/parallel.pdf}, confidential = {n}, } Sebastian Buschjäger, Kuan-Hsun Chen, Jian-Jia Chen and Katharina Morik.Realization of Random Forest for Real-Time Evaluation through Tree Framing. In The IEEE International Conference on Data Mining (ICDM), Singapore, November 2018[BibTeX]@inproceedings { Buschjaeger2018, author = {Buschj\"ager, Sebastian and Chen, Kuan-Hsun and Chen, Jian-Jia and Morik, Katharina}, title = {Realization of Random Forest for Real-Time Evaluation through Tree Framing}, booktitle = {The IEEE International Conference on Data Mining (ICDM)}, year = {2018}, address = {Singapore}, month = {November}, keywords = {kuan}, confidential = {n}, } Anas Toma, Vincent Meyers and Jian-Jia Chen.Implementation and Evaluation of Multi-Mode Real-Time Tasks under Different Scheduling Algorithms. In the 14th annual workshop on Operating Systems Platforms for Embedded Real-Time applications (OSPERT 2018), Barcelona, Spain, July 2018[BibTeX][PDF][Abstract]@inproceedings { Toma-OSPERT2018, author = {Toma, Anas and Meyers, Vincent and Chen, Jian-Jia}, title = {Implementation and Evaluation of Multi-Mode Real-Time Tasks under Different Scheduling Algorithms}, booktitle = {the 14th annual workshop on Operating Systems Platforms for Embedded Real-Time applications (OSPERT 2018), Barcelona, Spain}, year = {2018}, address = {Barcelona, Spain}, month = {July}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2018-toma-ospert.pdf}, confidential = {n}, abstract = {Tasks in the multi-mode real-time model have different execution modes according to an external input. Every mode represents a level of functionality where the tasks have different parameters. Such a model exists in automobiles, where some of the tasks that control the engine must always adapt to its rotation speed. Many studies have evaluated the feasibility of such a model under different scheduling algorithms, however, only through simulation. This paper provides an empirical evaluation of the schedulability of multi-mode real-time tasks under fixed- and dynamic-priority scheduling algorithms. Furthermore, an evaluation of the overhead of the scheduling algorithms is provided. The implementation and the evaluation were carried out in a real environment using Raspberry Pi hardware and the FreeRTOS real-time operating system. A simulation of a crankshaft was performed to generate realistic tasks in addition to the synthetic ones. Contrary to expectations, the results show that the Rate-Monotonic algorithm outperforms the Earliest Deadline First algorithm in scheduling tasks with relatively shorter periods.}, } Jan Eric Lenssen, Anas Toma, Albert Seebold, Victoria Shpacovitch, Pascal Libuschewski, Frank Weichert, Jian-Jia Chen and Roland Hergenröder.Real-Time Low SNR Signal Processing for Nanoparticle Analysis with Deep Neural Networks.
In the 11th International Conference on Bio-Inspired Systems and Signal Processing (BIOSIGNALS 2018), Funchal, Portugal, January 2018, (Best Paper Award) [BibTeX][Abstract]@inproceedings { Eric-Biosignals18, author = {Lenssen, Jan Eric and Toma, Anas and Seebold, Albert and Shpacovitch, Victoria and Libuschewski, Pascal and Weichert, Frank and Chen, Jian-Jia and Hergenr{\"o}der, Roland}, title = {Real-Time Low SNR Signal Processing for Nanoparticle Analysis with Deep Neural Networks}, booktitle = {the 11th International Conference on Bio-Inspired Systems and Signal Processing (BIOSIGNALS 2018)}, year = {2018}, address = {Funchal, Portugal}, month = {January}, note = { (Best Paper Award) }, confidential = {n}, abstract = {In this work, we improve several steps of our Plasmon Assisted Microscopy Of Nano-sized Objects (PAMONO) sensor data processing pipeline through the application of deep neural networks. The PAMONO-biosensor is a mobile nanoparticle sensor utilizing Surface Plasmon Resonance (SPR) imaging for quantification and analysis of nanoparticles in liquid or air samples. Characteristics of PAMONO sensor data are spatiotemporal blob-like structures with a very low Signal-to-Noise Ratio (SNR), which indicate particle bindings and can be automatically analyzed with image processing methods. We propose and evaluate deep neural network architectures for spatiotemporal detection, time-series analysis and classification. We compare them to traditional methods like frequency domain or polygon shape features classified by a Random Forest classifier. It is shown that the application of deep learning enables the sensor to automatically detect and quantify 80 nm polystyrene particles and pushes the limits in blob detection with very low SNRs below one. In addition, we present benchmarks and show that real-time processing is achievable on consumer-level desktop Graphics Processing Units (GPUs).}, } Georg Brüggen, Lea Schönberger and Jian-Jia Chen.Do Nothing, but Carefully: Fault Tolerance with Timing Guarantees for Multiprocessor Systems devoid of Online Adaptation.
In The 23rd IEEE Pacific Rim International Symposium on Dependable Computing (PRDC 2018), Taipei, Taiwan, December 4-7 2018[BibTeX][Abstract]@inproceedings { brueggenetalPRDC2018, author = {Br\"uggen, Georg and Sch\"onberger, Lea and Chen, Jian-Jia}, title = {Do Nothing, but Carefully: Fault Tolerance with Timing Guarantees for Multiprocessor Systems devoid of Online Adaptation}, booktitle = {The 23rd IEEE Pacific Rim International Symposium on Dependable Computing (PRDC 2018)}, year = {2018}, address = {Taipei, Taiwan}, month = {December 4-7}, keywords = {georg, lea}, confidential = {n}, abstract = {Many practical real-time systems must be able to sustain several reliability threats induced by their physical environments that cause short-term abnormal system behavior, such as transient faults. To cope with this change of system behavior, online adaptations, which may introduce a high computation overhead, are performed in many cases to ensure the timeliness of the more important tasks while no guarantees are provided for the less important tasks. In this work, we propose a system model which does not require any online adaptation but, according to the concept of dynamic real-time guarantees, provides full timing guarantees as well as limited timing guarantees, depending on the system behavior. For the normal system behavior, timeliness is guaranteed for all tasks; otherwise, timeliness is guaranteed only for the more important tasks while bounded tardiness is ensured for the less important tasks. Aiming to provide such dynamic timing guarantees, we propose a suitable system model and discuss how this can be established by means of partitioned as well as semi-partitioned strategies. Moreover, we propose an approach for handling abnormal behavior with a longer duration, such as intermittent faults or overheating of processors, by performing task migration in order to compensate the affected system component and to increase the system’s reliability. We show by comprehensive experiments that good acceptance ratios can be achieved under partitioned scheduling, which can be further improved under semi-partitioned strategies. In addition, we demonstrate that the proposed migration techniques lead to a reasonable trade-off between the decrease in schedulability and the gain in robustness of the system. The presented approaches can also be applied to mixed-criticality systems with two criticality levels.}, }
Lea Schönberger, Wen-Hung Huang, Georg von der Brüggen, Kuan-Hsun Chen and Jian-Jia Chen.Schedulability Analysis and Priority Assignment for Segmented Self-Suspending Tasks. In The 24th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), Hakodate, Japan, August 28-31 2018[BibTeX][Abstract]@inproceedings { schoenbergerRTCSA2018, author = {Sch\"onberger, Lea and Huang, Wen-Hung and Br\"uggen, Georg von der and Chen, Kuan-Hsun and Chen, Jian-Jia}, title = {Schedulability Analysis and Priority Assignment for Segmented Self-Suspending Tasks}, booktitle = {The 24th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA)}, year = {2018}, address = {Hakodate, Japan}, month = {August 28-31}, keywords = {kuan, lea, Georg}, confidential = {n}, abstract = {Self-suspending behavior in real-time embedded systems can have a major and non-trivial negative impact on timing predictability. In this work, we investigate how to analyze the schedulability of segmented self-suspending task systems under a fixed-priority assignment. For this purpose, we introduce the multi-segment workload function as well as the maximum workload function in order to quantify the maximum interference from the higher-priority tasks when constructing our (sufficient) schedulability test. Moreover, we derive an optimal priority assignment with respect to our schedulability test since it is compatible with Audsley’s Optimal Priority Assignment (OPA). We show by means of comprehensive evaluations that our approach is highly effective concerning the number of schedulable task sets. Furthermore, one set of results reveals a rather non-intuitive observation, namely, that the worst-case suspension time of a computation segment should also be respected to improve the schedulability even if the suspension may finish earlier.}, }
Georg von der Brüggen, Nico Piatkowski, Kuan-Hsun Chen, Jian-Jia Chen and Katharina Morik.Efficiently Approximating the Probability of Deadline Misses in Real-Time Systems. In 30th Euromicro Conference on Real-Time Systems (ECRTS 2018) 2018[BibTeX][Link][Abstract]@inproceedings { DBLP:conf/ecrts/BruggenPCCM18, author = {Br\"uggen, Georg von der and Piatkowski, Nico and Chen, Kuan-Hsun and Chen, Jian-Jia and Morik, Katharina}, title = {Efficiently Approximating the Probability of Deadline Misses in Real-Time Systems}, booktitle = {30th Euromicro Conference on Real-Time Systems (ECRTS 2018)}, year = {2018}, url = {http://drops.dagstuhl.de/opus/volltexte/2018/8997/pdf/LIPIcs-ECRTS-2018-6.pdf}, keywords = {kuan, Georg}, confidential = {n}, abstract = {This paper explores the probability of deadline misses for a set of constrained-deadline sporadic soft real-time tasks on uniprocessor platforms. We explore two directions to evaluate the probability that a job of the task under analysis can finish its execution at (or before) a testing time point t. One approach is based on analytical upper bounds that can be efficiently computed in polynomial time at the price of precision loss for each testing point, derived from the well-known Hoeffding's inequality and the well-known Bernstein's inequality. Another approach convolutes the probability efficiently over multinomial distributions, exploiting a series of state space reduction techniques, i.e., pruning without any loss of precision, and approximations via unifying equivalent classes with a bounded loss of precision. We demonstrate the effectiveness of our approaches in a series of evaluations. Distinct from the convolution-based methods in the literature, which suffer from a high computation demand and are applicable only to task sets with a few tasks, our approaches can scale reasonably without losing much precision in terms of the derived probability of deadline misses.}, }
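As a pointer to the analytical direction mentioned in this abstract, Hoeffding's inequality bounds the probability that a sum $S$ of independent, bounded job execution times $C_i \in [a_i, b_i]$ exceeds the available time $t$; the notation here is ours, while the paper derives bounds tailored to its task model:

```latex
\Pr\left[S \ge t\right] \;\le\; \exp\!\left(-\frac{2\left(t - \mathbb{E}[S]\right)^{2}}{\sum_{i}\left(b_{i}-a_{i}\right)^{2}}\right),
\qquad S = \sum_{i} C_{i},\; t \ge \mathbb{E}[S].
```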
Anas Toma, Alexander Starinow, Jan Eric Lenssen and Jian-Jia Chen.Saving Energy for Cloud Applications in Mobile Devices using Nearby Resources. In the 26th Euromicro International Conference on Parallel, Distributed and Network-based Processing (PDP 2018), Cambridge, UK, March 2018[BibTeX][PDF][Abstract]@inproceedings { Toma-PDP2018, author = {Toma, Anas and Starinow, Alexander and Lenssen, Jan Eric and Chen, Jian-Jia}, title = {Saving Energy for Cloud Applications in Mobile Devices using Nearby Resources}, booktitle = {the 26th Euromicro International Conference on Parallel, Distributed and Network-based Processing (PDP 2018)}, year = {2018}, address = {Cambridge, UK}, month = {March}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2018-toma-pdp.pdf}, confidential = {n}, abstract = {In this paper, we present a middleware to save energy in mobile computing devices that offload tasks to a remote server in the cloud. Saving energy in these devices is very important to prolong the battery life and avoid overheating. The middleware uses an available nearby device, called an auxiliary server, either as a surrogate for the remote one or as a proxy to pass the data between the mobile device and the remote server. The main idea is to reduce the energy consumption of the communication with the remote server by using a high-speed or a low-power local connection with the auxiliary server instead. The paper also analyzes when it is beneficial to use the auxiliary server, based on the response time from the remote server and the bandwidth of the remote connection. The proposed middleware is evaluated using different benchmarks, including commonly used applications in mobile devices, and simulations. Furthermore, it is compared to state-of-the-art approaches in this area. The experiments show that the middleware is energy-efficient, especially when the bandwidth of the remote communication is relatively low or the server is overloaded.}, }
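The break-even analysis described in this abstract can be pictured with a first-order energy model; the function and parameters below are entirely illustrative, not the paper's model. Energy is spent transmitting the input data and then idling until the result returns.

```python
def offload_energy(data_bits: float, bandwidth_bps: float, tx_power_w: float,
                   idle_power_w: float, response_time_s: float) -> float:
    """Hypothetical first-order model: transmission energy plus the energy
    spent idling while waiting for the server's response."""
    return tx_power_w * (data_bits / bandwidth_bps) + idle_power_w * response_time_s

# Offloading to the nearby auxiliary server pays off whenever the local link
# makes it cheaper, i.e. offload_energy(...) over the fast local connection is
# smaller than offload_energy(...) over the slow or congested remote connection.
```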
Junjie Shi, Kuan-Hsun Chen, Shuai Zhao, Wen-Hung Huang, Jian-Jia Chen and Andy Wellings.Implementation and Evaluation of Multiprocessor Resource Synchronization Protocol (MrsP) on LITMUSRT. In 13th Workshop on Operating Systems Platforms for Embedded Real-Time Applications 2017[BibTeX][PDF][Abstract]@inproceedings { OSPERT17, author = {Shi, Junjie and Chen, Kuan-Hsun and Zhao, Shuai and Huang, Wen-Hung and Chen, Jian-Jia and Wellings, Andy}, title = {Implementation and Evaluation of Multiprocessor Resource Synchronization Protocol (MrsP) on LITMUSRT}, booktitle = {13th Workshop on Operating Systems Platforms for Embedded Real-Time Applications}, year = {2017}, keywords = {kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017-junjie-ospert.pdf}, confidential = {n}, abstract = {Preventing race conditions or data corruption for concurrent shared-resource accesses of real-time tasks is a challenging problem. By adopting resource synchronization protocols, this problem has been studied in the literature, but there are not enough evaluations that consider the overhead of the implementations of different protocols. In this paper, we discuss our implementation of the Multiprocessor Resource Sharing Protocol (MrsP) and the Distributed Non-Preemptive Protocol (DNPP) on LITMUSRT. Both of them are released in open source under the GNU General Public License (GPL2). To study the impact of the implementation overhead, we deploy different synchronization scenarios with generated task sets and measure the performance with respect to the worst-case response time. The results illustrate that generally the implementation overhead is acceptable, whereas some unexpected system overhead may happen under distributed synchronization protocols on LITMUSRT.}, } Kuan-Hsun Chen and Jian-Jia Chen.Probabilistic Schedulability Tests for Uniprocessor Fixed-Priority Scheduling under Soft Errors. In IEEE International Symposium on Industrial Embedded Systems (SIES), pages 1--8 2017[BibTeX][PDF][Abstract]@inproceedings { SIES2017, author = {Chen, Kuan-Hsun and Chen, Jian-Jia}, title = {Probabilistic Schedulability Tests for Uniprocessor Fixed-Priority Scheduling under Soft Errors}, booktitle = {IEEE International Symposium on Industrial Embedded Systems (SIES)}, year = {2017}, pages = {1--8}, keywords = {kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017-kuan-epst.pdf}, confidential = {n}, abstract = {Due to rising integration, low-voltage operation, and environmental influences such as electromagnetic interference and radiation, transient faults may cause soft errors and corrupt the execution state.
Such soft errors can be recovered from by applying fault-tolerant techniques. Therefore, the execution time of a job of a sporadic/periodic task may differ, depending upon the occurrence of soft errors and the applied error detection and recovery mechanisms. We model a periodic/sporadic real-time task under such a scenario by using two different worst-case execution times (WCETs): one with the occurrence of soft errors and one without. Based on a probabilistic soft-error model, the two WCETs hence occur with different probabilities. In this paper, we present efficient probabilistic schedulability tests that can be applied to verify the schedulability based on probabilistic arguments under fixed-priority scheduling on a uniprocessor system. We demonstrate how Chernoff bounds can be used to calculate the task workloads based on their probabilistic WCETs. In addition, we further consider how to calculate the probability of consecutive deadline misses of a task. The pessimism and the efficiency of our approaches are evaluated against the tighter but approximated convolution-based approaches by running extensive evaluations under different soft-error rates. The evaluation results show that our approaches are effective in deriving the probability of deadline misses and efficient with respect to the needed calculation time.}, }
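To make the Chernoff-bound step above concrete: for jobs whose execution times independently take either a normal or an error-recovery WCET, the tail probability of the accumulated demand S exceeding a bound x satisfies P[S >= x] <= min over s > 0 of exp(-s*x) times the product of the moment-generating functions E[exp(s*C_i)]. A minimal sketch of this computation follows; the task parameters are invented examples and the grid search over s is a simplification, not the paper's test.

import math

# Hedged sketch of a Chernoff-style tail bound on the accumulated execution
# demand of jobs whose execution time is two-valued (normal WCET vs. WCET
# with error recovery). Parameters are invented; not the paper's test.

def log_mgf_two_point(s, c_normal, c_error, p_err):
    # log E[exp(s*C)] for C = c_error with probability p_err, else c_normal
    return math.log((1 - p_err) * math.exp(s * c_normal) + p_err * math.exp(s * c_error))

def chernoff_tail_bound(jobs, x, s_values=None):
    # upper bound on P[sum of execution times >= x], assuming independence:
    # min over s > 0 of exp(-s*x + sum_i log E[exp(s*C_i)])
    s_values = s_values or [k / 100.0 for k in range(1, 301)]
    best = 1.0
    for s in s_values:
        log_bound = -s * x + sum(log_mgf_two_point(s, cn, ce, p) for cn, ce, p in jobs)
        best = min(best, math.exp(min(log_bound, 0.0)))
    return best

jobs = [(1.0, 2.0, 1e-4)] * 10 + [(2.0, 3.5, 1e-4)] * 5   # (C_normal, C_error, p_err)
print(chernoff_tail_bound(jobs, x=25.0))                   # tiny bound on demand >= 25

Because each execution time is two-valued, its moment-generating function has the closed form used in log_mgf_two_point, which keeps the entire bound computable in a few lines.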
Olaf Neugebauer, Peter Marwedel, Roland Kühn and Michael Engel. Quality Evaluation Strategies for Approximate Computing in Embedded Systems. In Technological Innovation for Smart Systems: 8th IFIP WG 5.5/SOCOLNET Advanced Doctoral Conference on Computing, Electrical and Industrial Systems, DoCEIS 2017, Costa de Caparica, Portugal, May 3-5, 2017, Proceedings, pages 203--210, 2017. [BibTeX][Link]@inproceedings { Neugebauer2017, author = {Neugebauer, Olaf and Marwedel, Peter and K{\"u}hn, Roland and Engel, Michael}, title = {Quality Evaluation Strategies for Approximate Computing in Embedded Systems}, booktitle = {Technological Innovation for Smart Systems: 8th IFIP WG 5.5/SOCOLNET Advanced Doctoral Conference on Computing, Electrical and Industrial Systems, DoCEIS 2017, Costa de Caparica, Portugal, May 3-5, 2017, Proceedings}, year = {2017}, editor = {Camarinha-Matos, Luis M. and Parreira-Rocha, Mafalda and Ramezani, Javaneh}, pages = {203--210}, publisher = {Springer International Publishing}, url = {http://dx.doi.org/10.1007/978-3-319-56077-9_19}, confidential = {n}, }

Helena Kotthaus, Jakob Richter, Andreas Lang, Janek Thomas, Bernd Bischl, Peter Marwedel, Jörg Rahnenführer and Michel Lang. RAMBO: Resource-Aware Model-Based Optimization with Scheduling for Heterogeneous Runtimes and a Comparison with Asynchronous Model-Based Optimization. In Proceedings of the 11th International Conference: Learning and Intelligent Optimization (LION 11), pages 180--195, 2017. [BibTeX][Link][Abstract]@inproceedings { kotthaus/2017a, author = {Kotthaus, Helena and Richter, Jakob and Lang, Andreas and Thomas, Janek and Bischl, Bernd and Marwedel, Peter and Rahnenf\"uhrer, J\"org and Lang, Michel}, title = {RAMBO: Resource-Aware Model-Based Optimization with Scheduling for Heterogeneous Runtimes and a Comparison with Asynchronous Model-Based Optimization}, booktitle = {Proceedings of the 11th International Conference: Learning and Intelligent Optimization (LION 11)}, year = {2017}, pages = {180--195}, publisher = {Lecture Notes in Computer Science, Springer}, url = {http://www.springer.com/de/book/9783319694030}, confidential = {n}, abstract = {Sequential model-based optimization is a popular technique for global optimization of expensive black-box functions. It uses a regression model to approximate the objective function and iteratively proposes new interesting points. Deviating from the original formulation, it is often indispensable to apply parallelization to speed up the computation. This is usually achieved by evaluating as many points per iteration as there are workers available. However, if the runtimes of the objective function are heterogeneous, resources might be wasted by idle workers. Our new knapsack-based scheduling approach aims at increasing the effectiveness of parallel optimization through efficient resource utilization. Using runtime predictions of point evaluations derived from an additional regression model, we map evaluations to workers efficiently and reduce idling. We compare our approach to five established parallelization strategies on a set of continuous functions with heterogeneous runtimes. Our benchmark covers comparisons of synchronous and asynchronous model-based approaches and investigates the scalability.}, }
Helena Kotthaus, Andreas Lang, Olaf Neugebauer and Peter Marwedel. R goes Mobile: Efficient Scheduling for Parallel R Programs on Heterogeneous Embedded Systems. In Abstract Booklet of the International R User Conference (UseR!), page 74, Brussels, Belgium, July 2017. [BibTeX][Link]@inproceedings { kotthaus/2017b, author = {Kotthaus, Helena and Lang, Andreas and Neugebauer, Olaf and Marwedel, Peter}, title = {R goes Mobile: Efficient Scheduling for Parallel R Programs on Heterogeneous Embedded Systems}, booktitle = {Abstract Booklet of the International R User Conference (UseR!)}, year = {2017}, pages = {74}, address = {Brussels, Belgium}, month = {July}, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017_user_kotthaus.pdf }, confidential = {n}, }

Jian-Jia Chen, Georg von der Brüggen, Wen-Hung Huang and Robert I. Davis. On the Pitfalls of Resource Augmentation Factors and Utilization Bounds in Real-Time Scheduling. In 29th Euromicro Conference on Real-Time Systems (ECRTS), pages 9:1--9:25, Dubrovnik, Croatia, June 27-30, 2017. [BibTeX][PDF][Link]@inproceedings { DBLP:conf/ecrts/ChenBHD17, author = {Chen, Jian-Jia and Br\"uggen, Georg von der and Huang, Wen-Hung and Davis, Robert I.}, title = {On the Pitfalls of Resource Augmentation Factors and Utilization Bounds in Real-Time Scheduling}, booktitle = {29th Euromicro Conference on Real-Time Systems, {ECRTS}}, year = {2017}, pages = {9:1--9:25}, address = {Dubrovnik, Croatia}, month = {June 27-30}, url = {https://doi.org/10.4230/LIPIcs.ECRTS.2017.9}, keywords = {Georg}, file = {http://drops.dagstuhl.de/opus/volltexte/2017/7161/pdf/LIPIcs-ECRTS-2017-9.pdf}, confidential = {n}, }

Jian-Jia Chen, Georg von der Brüggen, Wen-Hung Huang and Cong Liu. State of the art for scheduling and analyzing self-suspending sporadic real-time tasks.
In 23rd IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), pages 1--10, Hsinchu, Taiwan, August 16-18, 2017. Invited paper. [BibTeX][PDF][Link]@inproceedings { DBLP:conf/rtcsa/ChenBH017, author = {Chen, Jian-Jia and Br\"uggen, Georg von der and Huang, Wen-Hung and Liu, Cong}, title = {State of the art for scheduling and analyzing self-suspending sporadic real-time tasks}, booktitle = {23rd {IEEE} International Conference on Embedded and Real-Time Computing Systems and Applications {RTCSA}}, year = {2017}, pages = {1--10}, address = {Hsinchu, Taiwan}, month = {August 16-18}, note = {Invited paper}, url = {http://doi.ieeecomputersociety.org/10.1109/RTCSA.2017.8046321}, keywords = {Georg}, file = {media/documents/publications/downloads/2017-chen-RTCSA.suspension-review.pdf}, confidential = {n}, }

Jian-Jia Chen, Wen-Hung Huang, Zheng Dong and Cong Liu. Fixed-priority scheduling of mixed soft and hard real-time tasks on multiprocessors. In 23rd IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), pages 1--10, Hsinchu, Taiwan, August 16-18, 2017. [BibTeX][PDF][Link]@inproceedings { DBLP:conf/rtcsa/ChenHD017, author = {Chen, Jian-Jia and Huang, Wen-Hung and Dong, Zheng and Liu, Cong}, title = {Fixed-priority scheduling of mixed soft and hard real-time tasks on multiprocessors}, booktitle = {23rd {IEEE} International Conference on Embedded and Real-Time Computing Systems and Applications, {RTCSA}}, year = {2017}, pages = {1--10}, address = {Hsinchu, Taiwan}, month = {August 16-18}, url = {http://doi.ieeecomputersociety.org/10.1109/RTCSA.2017.8046312}, file = {media/documents/publications/downloads/2017-chen-RTCSA-SRT.pdf}, confidential = {n}, }

Georg von der Brüggen, Wen-Hung Huang and Jian-Jia Chen. Hybrid self-suspension models in real-time embedded systems. In 23rd IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), pages 1--9, Hsinchu, Taiwan, August 16-18, 2017. [BibTeX][PDF][Link]@inproceedings { DBLP:conf/rtcsa/BruggenHC17, author = {Br\"uggen, Georg von der and Huang, Wen-Hung and Chen, Jian-Jia}, title = {Hybrid self-suspension models in real-time embedded systems}, booktitle = {23rd {IEEE} International Conference on Embedded and Real-Time Computing Systems and Applications, {RTCSA}}, year = {2017}, pages = {1--9}, address = {Hsinchu, Taiwan}, month = {August 16-18}, url = {http://doi.ieeecomputersociety.org/10.1109/RTCSA.2017.8046328}, keywords = {Georg, kevin}, file = {media/documents/publications/downloads/2017-vdbruggen-RTCSA-hybrid-suspension.pdf}, confidential = {n}, }

Georg von der Brüggen, Jian-Jia Chen, Wen-Hung Huang and Maolin Yang. Release Enforcement in Resource-Oriented Partitioned Scheduling for Multiprocessor Systems.
In 25th International Conference on Real-Time Networks and Systems (RTNS), 2017. [BibTeX][PDF]@inproceedings { DBLP:conf/rtns/Bruggen17rop, author = {Br\"uggen, Georg von der and Chen, Jian-Jia and Huang, Wen-Hung and Yang, Maolin}, title = {Release Enforcement in Resource-Oriented Partitioned Scheduling for Multiprocessor Systems}, booktitle = {25th International Conference on Real-Time Networks and Systems (RTNS)}, year = {2017}, keywords = {Georg, kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017_rtns_brueggen_rop.pdf}, confidential = {n}, }

Georg von der Brüggen, Niklas Ueter, Jian-Jia Chen and Matthias Freier. Parametric Utilization Bounds for Implicit-Deadline Periodic Tasks in Automotive Systems. In 25th International Conference on Real-Time Networks and Systems (RTNS), Grenoble, France, 2017. [BibTeX][PDF]@inproceedings { DBLP:conf/rtns/Bruggen17automotive, author = {Br\"uggen, Georg von der and Ueter, Niklas and Chen, Jian-Jia and Freier, Matthias}, title = {Parametric Utilization Bounds for Implicit-Deadline Periodic Tasks in Automotive Systems}, booktitle = {25th International Conference on Real-Time Networks and Systems (RTNS)}, year = {2017}, address = {Grenoble, France}, keywords = {Georg}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017_brueggen_rtns_automotive.pdf}, confidential = {n}, }

Wen-Hung Huang and Jian-Jia Chen. Self-Suspension Real-Time Tasks under Fixed-Relative-Deadline Fixed-Priority Scheduling. In Design, Automation and Test in Europe (DATE), Dresden, Germany, 14-18th March 2016. [BibTeX][PDF][Abstract]@inproceedings { HC16, author = {Huang, Wen-Hung and Chen, Jian-Jia}, title = {Self-Suspension Real-Time Tasks under Fixed-Relative-Deadline Fixed-Priority Scheduling}, booktitle = {Design, Automation and Test in Europe (DATE)}, year = {2016}, address = {Dresden, Germany}, month = {14 -18th Mar}, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/Self-Suspension-EDA-FP-6p}, confidential = {n}, abstract = {Self-suspension is becoming a prominent characteristic in real-time systems such as: (i) I/O-intensive systems, (ii) multi-core processors, and (iii) computation offloading systems with coprocessors, like Graphics Processing Units (GPUs). In this work, we study self-suspension systems under the fixed-priority (FP) fixed-relative-deadline (FRD) algorithm, using release enforcement to control the behavior of self-suspending tasks. Specifically, we use equal-deadline assignment (EDA) to assign the release phases of computations and suspensions. We provide an analysis for deriving the speedup factor of the FP FRD scheduler using suspension-laxity-monotonic (SLM) priority assignment. This is the first positive result providing bounded speedup factor guarantees for general multi-segment self-suspending task systems.}, }
Wen-Hung Huang, Jian-Jia Chen and Jan Reineke. MIRROR: Symmetric Timing Analysis for Real-Time Tasks on Multicore Platforms with Shared Resources. In Design Automation Conference (DAC), Austin, TX, USA, June 05-09, 2016. [BibTeX][PDF][Abstract]@inproceedings { WR16, author = {Huang, Wen-Hung and Chen, Jian-Jia and Reineke, Jan}, title = {MIRROR: Symmetric Timing Analysis for Real-Time Tasks on Multicore Platforms with Shared Resources}, booktitle = {Design Automation Conference (DAC)}, year = {2016}, address = {Austin, TX, USA}, month = {June 05-09 }, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/PID4192125-mirror.pdf}, confidential = {n}, abstract = {The emergence of multicore and manycore platforms poses a big challenge for the design of real-time embedded systems, especially for timing analysis. We observe in this paper that response-time analysis for multicore platforms with shared resources can be symmetrically approached from two perspectives: a core-centric and a shared-resource-centric perspective. The common "core-centric" perspective is that a task executes on a core until it suspends its execution due to shared-resource accesses. The potentially less intuitive "shared-resource-centric" perspective is that a task performs requests on shared resources until suspending itself to perform computation back on its respective core. Based on this observation, we provide a pseudo-polynomial-time schedulability test and response-time analysis for constrained-deadline sporadic task systems. In addition, we propose a task partitioning algorithm that achieves a speedup factor of 7, compared to the optimal schedule. This constitutes the first result in this research line with a speedup factor guarantee. The experimental evaluation demonstrates that our approach can yield high acceptance ratios if the tasks have only a few resource access segments.}, }
Wen-Hung Huang and Jian-Jia Chen. Utilization Bounds on Allocating Rate-Monotonic Scheduled Multi-Mode Tasks on Multiprocessor Systems. In Design Automation Conference (DAC), Austin, TX, USA, June 05-09, 2016. [BibTeX][PDF][Abstract]@inproceedings { HC16a, author = {Huang, Wen-Hung and Chen, Jian-Jia}, title = {Utilization Bounds on Allocating Rate-Monotonic Scheduled Multi-Mode Tasks on Multiprocessor Systems}, booktitle = {Design Automation Conference (DAC)}, year = {2016}, address = {Austin, TX, USA}, month = {June 05-09 }, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/PID4192127-multimode-mp.pdf}, confidential = {n}, abstract = {Formal models used for representing recurrent real-time processes have traditionally been characterized by a collection of jobs that are released periodically. However, such modeling may result in resource under-utilization in systems whose behavior is not entirely periodic. For instance, tasks in a cyber-physical system (CPS) may change their service levels, e.g., periods and/or execution times, to adapt to changes in the environment. In this work, we study a generalization of the periodic task model, called the multi-mode task model: a task has several modes, specified with different execution times and periods, between which it can switch at runtime, independently of other tasks. Moreover, we study the problem of allocating a set of multi-mode tasks on a homogeneous multiprocessor system. We present a scheduling algorithm using any reasonable allocation decreasing (RAD) algorithm for task allocation. We prove that this algorithm achieves a 38% utilization bound for implicit-deadline rate-monotonic (RM) scheduled multi-mode tasks on multiprocessor systems.}, }
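Read as a plain admission test, the headline bound above says: if every multi-mode task is accounted for with its maximum per-mode utilization, a total utilization of 38% of the platform capacity is sufficient under RM. A toy check along those lines follows; it only mirrors the stated bound, not the RAD allocation algorithm itself, and the task set is invented.

# Hedged sketch: an admission check mirroring the 38% utilization bound
# stated in the abstract for implicit-deadline RM multi-mode tasks on m
# processors. Each task is counted with its maximum per-mode utilization;
# this illustrates only the headline bound, not the RAD allocation itself.

def max_mode_utilization(modes):
    # modes: list of (wcet, period) pairs of one multi-mode task
    return max(c / t for c, t in modes)

def admits(tasks, m, bound=0.38):
    return sum(max_mode_utilization(modes) for modes in tasks) <= bound * m

tasks = [[(1, 10), (3, 20)], [(2, 8), (1, 16)], [(5, 40)]]
print(admits(tasks, m=2))   # True: 0.15 + 0.25 + 0.125 = 0.525 <= 0.76

Counting each task at its maximum mode is the safe over-approximation here, since a task may switch into its heaviest mode at any time independently of the others.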
Kuan-Hsun Chen, Georg von der Brüggen and Jian-Jia Chen. Overrun Handling for Mixed-Criticality Support in RTEMS. In Workshop on Mixed-Criticality Systems, 2016. [BibTeX][PDF][Abstract]@inproceedings { WMC2016, author = {Chen, Kuan-Hsun and Br\"uggen, Georg von der and Chen, Jian-Jia}, title = {Overrun Handling for Mixed-Criticality Support in RTEMS}, booktitle = {Workshop on Mixed-Criticality Systems}, year = {2016}, keywords = {kuan, Georg}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-wmc.pdf}, confidential = {n}, abstract = {Real-time operating systems are not only used in embedded real-time systems but are also useful for the simulation and validation of those systems. During the evaluation of our paper about Systems with Dynamic Real-Time Guarantees, which appears in RTSS 2016, we discovered certain unexpected system behavior in the open-source real-time operating system RTEMS. In the current implementation of RTEMS (version 4.11), overruns of an implicit-deadline task, i.e., deadline misses, result in unexpected system behavior, as they may lead to a shift of the release pattern of the task. As a consequence, some task instances are not released as they should be. In this paper we explain why these problems occur in RTEMS and present our solutions.}, }
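The release-pattern shift described above can be reproduced with a few lines of arithmetic: if the next release is derived from the completion time of the previous instance, every overrun delays all subsequent releases, whereas anchoring releases to the original periodic grid keeps the pattern intact. The sketch below (plain Python, not RTEMS code) contrasts the two policies on an invented workload.

# Hedged sketch (plain Python, not RTEMS code) of the two release policies:
# deriving the next release from the completion time lets one overrun shift
# the whole pattern, while anchoring releases to the periodic grid does not.

def releases_shifted(period, exec_times):
    # next release follows the completion of the previous instance
    t, releases = 0, []
    for c in exec_times:
        releases.append(t)
        t += max(c, period)   # c > period is an overrun delaying later releases
    return releases

def releases_anchored(period, n):
    # next release = k * period, independent of overruns
    return [k * period for k in range(n)]

print(releases_shifted(10, [4, 13, 4, 4]))   # [0, 10, 23, 33] -- pattern drifted
print(releases_anchored(10, 4))              # [0, 10, 20, 30]

In the drifted trace, the instance that should have been released at time 20 never appears on the grid, which is exactly the "instances are not released as they should be" symptom the abstract reports.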
Jian-Jia Chen, Wen-Hung Huang and Cong Liu. k2Q: A Quadratic-Form Response Time and Schedulability Analysis Framework for Utilization-Based Analysis. In Real-Time Systems Symposium (RTSS), Porto, Portugal, Nov. 29 - Dec. 2, 2016. [BibTeX]@inproceedings { RTSS2016-k2Q, author = {Chen, Jian-Jia and Huang, Wen-Hung and Liu, Cong}, title = {k2Q: A Quadratic-Form Response Time and Schedulability Analysis Framework for Utilization-Based Analysis}, booktitle = {Real-Time Systems Symposium (RTSS)}, year = {2016}, address = {Porto, Portugal}, month = {Nov. 29 - Dec. 2}, keywords = {kevin }, confidential = {n}, }

Kuan-Hsun Chen, Björn Bönninghoff, Jian-Jia Chen and Peter Marwedel. Compensate or Ignore? Meeting Control Robustness Requirements through Adaptive Soft-Error Handling. In Languages, Compilers, Tools and Theory for Embedded Systems (LCTES), Santa Barbara, CA, U.S.A., June 2016. [BibTeX][PDF][Link][Abstract]@inproceedings { Chenlctes2016, author = {Chen, Kuan-Hsun and B\"onninghoff, Bj\"orn and Chen, Jian-Jia and Marwedel, Peter}, title = {Compensate or Ignore? Meeting Control Robustness Requirements through Adaptive Soft-Error Handling}, booktitle = {Languages, Compilers, Tools and Theory for Embedded Systems (LCTES)}, year = {2016}, address = {Santa Barbara, CA, U.S.A.}, month = {June}, organization = {ACM}, url = {http://dx.doi.org/10.1145/2907950.2907952}, keywords = {kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-khchen-lctes.pdf}, confidential = {n}, abstract = {To avoid catastrophic events like unrecoverable system failures on mobile and embedded systems caused by soft errors, software-based error detection and compensation techniques have been proposed. Methods like error-correction codes or redundant execution can offer high flexibility and allow for application-specific fault-tolerance selection without the need for special hardware support. However, such software-based approaches may lead to system overload due to the execution time overhead. An adaptive deployment of such techniques to meet both application requirements and system constraints is desired. From our case study, we observe that a control task can tolerate limited errors with acceptable performance loss. Such tolerance can be modeled as an (m, k) constraint, which requires at least m out of any k consecutive runs to be correct. In this paper, we discuss how a given (m, k) constraint can be satisfied by adopting patterns of task instances with individual error detection and compensation capabilities. We introduce static strategies and provide a formal feasibility analysis for validation. Furthermore, we develop an adaptive scheme that extends our initial approach with online awareness, increasing efficiency while preserving the analysis results. The effectiveness of our method is shown in a real-world case study as well as for synthesized task sets.}, }
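The (m, k) constraint defined in this abstract is easy to state operationally: every window of k consecutive job outcomes must contain at least m correct ones. A minimal checker follows; the histories are invented examples, and this is only the constraint itself, not the paper's pattern-selection strategies.

# Hedged sketch: check whether an execution history satisfies an (m, k)
# robustness constraint, i.e., at least m correct runs in every window of
# k consecutive runs. The histories below are invented examples.

def satisfies_mk(history, m, k):
    h = [bool(x) for x in history]      # True = run was correct
    if len(h) < k:
        return True                     # no complete window to violate yet
    return all(sum(h[i:i + k]) >= m for i in range(len(h) - k + 1))

print(satisfies_mk([1, 1, 0, 1, 1, 0, 1, 1], m=2, k=3))   # True
print(satisfies_mk([1, 0, 0, 1, 1, 1], m=2, k=3))         # False: window 1,0,0

Such a checker is the validation half of the problem; the paper's contribution is choosing which instances to protect so that every reachable history passes it.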
Ingo Korb, Helena Kotthaus and Peter Marwedel. mmapcopy: Efficient Memory Footprint Reduction using Application Knowledge. In Proceedings of the 31st Annual ACM Symposium on Applied Computing, Pisa, Italy, 2016. [BibTeX][PDF][Abstract]@inproceedings { korb:2016:sac, author = {Korb, Ingo and Kotthaus, Helena and Marwedel, Peter}, title = {mmapcopy: Efficient Memory Footprint Reduction using Application Knowledge}, booktitle = {Proceedings of the 31st Annual ACM Symposium on Applied Computing }, year = {2016}, address = {Pisa, Italy}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-korb-mmapcopy.pdf}, confidential = {n}, abstract = {Memory requirements can be a limiting factor for programs dealing with large data structures. Especially interpreted programming languages that are used to deal with large vectors, like R, suffer from memory overhead when copying such data structures. Avoiding data duplication directly in the application can reduce the memory requirements. Alternatively, generic kernel-level memory reduction functionality like deduplication and compression can lower the amount of memory required, but it needs to compensate for missing application knowledge by utilizing more CPU time, leading to excessive overhead. To allow new optimizations based on the application's knowledge about its own memory utilization, we propose to introduce a new system call. This system call uses the existing copy-on-write functionality of the Linux kernel to avoid duplicating memory when data is copied. Our experiments using real-world benchmarks written in the R language show that our approach can yield significant improvements in CPU time compared to Kernel Samepage Merging without compromising the amount of memory saved.}, }
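The copy-on-write mechanism that the proposed system call reuses is most familiar from fork(): pages are shared physically until one side writes. The sketch below (POSIX-only Python) demonstrates only those kernel semantics; it is not the mmapcopy syscall, whose point is to expose the same lazy duplication within a single process.

import os

# Hedged illustration (POSIX only) of the copy-on-write behaviour that the
# proposed mmapcopy system call builds on: after fork(), parent and child
# share physical pages until one side writes. This demonstrates the kernel
# semantics only; the paper's contribution is exposing CoW duplication
# within a single process through a new system call.

buf = bytearray(b"x" * (16 * 1024 * 1024))   # a 16 MiB "large vector"

pid = os.fork()
if pid == 0:
    total = sum(buf[::4096])    # child only reads: pages stay shared
    os._exit(0)
else:
    buf[0] = ord("y")           # first write triggers the actual page copy
    os.waitpid(pid, 0)
    print("child read shared pages; parent's write made a private copy")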
Jakob Richter, Helena Kotthaus, Bernd Bischl, Peter Marwedel, Jörg Rahnenführer and Michel Lang. Faster Model-Based Optimization through Resource-Aware Scheduling Strategies. In Proceedings of the 10th International Conference: Learning and Intelligent Optimization (LION 10), vol. 10079 of Lecture Notes in Computer Science, pages 267--273, 2016. [BibTeX][PDF][Link][Abstract]@inproceedings { kotthaus/2016a, author = {Richter, Jakob and Kotthaus, Helena and Bischl, Bernd and Marwedel, Peter and Rahnenf\"uhrer, J\"org and Lang, Michel}, title = {Faster Model-Based Optimization through Resource-Aware Scheduling Strategies}, booktitle = {Proceedings of the 10th International Conference: Learning and Intelligent Optimization (LION 10)}, year = {2016}, volume = {vol. 10079 of Lecture Notes in Computer Science}, pages = {267--273}, publisher = {Springer}, url = {http://link.springer.com/chapter/10.1007/978-3-319-50349-3_22}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016_kotthaus_lion10.pdf }, confidential = {n}, abstract = {We present a Resource-Aware Model-Based Optimization framework, RAMBO, that leads to efficient utilization of parallel computer architectures through resource-aware scheduling strategies. Conventional MBO fits a regression model on the set of already evaluated configurations and their observed performances to guide the search. Due to its inherent sequential nature, an efficient parallel variant cannot directly be derived, as only the most promising configuration w.r.t. an infill criterion is evaluated in each iteration. This issue has been addressed by generalized infill criteria in order to propose multiple points simultaneously for parallel execution in each sequential step. However, these extensions in general neglect systematic runtime differences in the configuration space, which often leads to underutilized systems. We estimate runtimes using an additional surrogate model to improve the scheduling and demonstrate that our framework approach already yields improved resource utilization on two exemplary classification tasks.}, }

Helena Kotthaus, Jakob Richter, Andreas Lang, Michel Lang and Peter Marwedel. Resource-Aware Scheduling Strategies for Parallel Machine Learning R Programs through RAMBO. In Abstract Booklet of the International R User Conference (UseR!), number 195, USA, Stanford, June 2016. [BibTeX][Link][Abstract]@inproceedings { kotthaus:2016b, author = {Kotthaus, Helena and Richter, Jakob and Lang, Andreas and Lang, Michel and Marwedel, Peter}, title = {Resource-Aware Scheduling Strategies for Parallel Machine Learning R Programs through RAMBO}, booktitle = {Abstract Booklet of the International R User Conference (UseR!)}, year = {2016}, number = {195}, address = {USA, Stanford}, month = {June}, url = {http://user2016.org/files/abs-book.pdf}, confidential = {n}, abstract = {We present resource-aware scheduling strategies for parallel R programs, leading to efficient utilization of parallel computer architectures by estimating resource demands. We concentrate on applications that consist of independent tasks. The R programming language is increasingly used to process large data sets in parallel, which requires a high amount of resources.
One important application is parameter tuning of machine learning algorithms, where evaluations need to be executed in parallel to reduce runtime. Here, the resource demands of tasks vary heavily depending on the algorithm configuration. Running such an application in a naive parallel way leads to inefficient resource utilization and thus to long runtimes. Therefore, the R package “parallel” offers a scheduling strategy called “load balancing”, which dynamically allocates tasks to worker processes. This option is recommended when tasks have widely different computation times or when computer architectures are heterogeneous. We analyzed the memory and CPU utilization of parallel applications with our TraceR profiling tool and found that the load-balancing mechanism is not sufficient for parallel tasks with high variance in resource demands. A scheduling strategy needs to know the resource demands of a task before execution to efficiently map applications to the available resources. Therefore, we build a regression model to estimate resource demands based on previously evaluated tasks. Resource estimates like runtime are then used to guide our scheduling strategies. These strategies are integrated in our RAMBO (Resource-Aware Model-Based Optimization) framework. Compared to the standard mechanisms of the parallel package, our approach yields improved resource utilization.}, }
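The core idea in the two RAMBO abstracts above, predict runtimes from past evaluations and schedule with those predictions instead of reacting to load, can be sketched in a few lines. The linear model, the task features, and all numbers below are invented placeholders, not the RAMBO implementation.

import heapq

# Hedged sketch: predict task runtimes from past observations and place the
# longest tasks first on the least loaded worker, rather than relying on
# naive dynamic load balancing. Model, features, and numbers are invented.

def fit_linear(xs, ys):
    # least-squares fit of y = a*x + b on past (feature, runtime) pairs
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    a = sum((x - mx) * (y - my) for x, y in zip(xs, ys)) / sum((x - mx) ** 2 for x in xs)
    return a, my - a * mx

def schedule_lpt(predicted_runtimes, n_workers):
    # longest-predicted-first onto the currently least loaded worker
    load = [(0.0, w) for w in range(n_workers)]
    heapq.heapify(load)
    plan = {w: [] for w in range(n_workers)}
    for task, rt in sorted(predicted_runtimes.items(), key=lambda kv: -kv[1]):
        t, w = heapq.heappop(load)
        plan[w].append(task)
        heapq.heappush(load, (t + rt, w))
    return plan

a, b = fit_linear([10, 20, 40, 80], [1.1, 2.0, 4.2, 8.1])          # past runs
preds = {f"cfg{i}": a * s + b for i, s in enumerate([15, 70, 30, 60, 5])}
print(schedule_lpt(preds, n_workers=2))

Placing the predicted-longest tasks first is what reduces the idle tails that plain load balancing produces when runtimes vary by orders of magnitude.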
Wen-Hung Huang, Maolin Yang and Jian-Jia Chen. Resource-Oriented Partitioned Scheduling in Multiprocessor Systems: How to Partition and How to Share? In Real-Time Systems Symposium (RTSS), Porto, Portugal, Nov. 29 - Dec. 2, 2016. (Outstanding Paper Award). We identified some typos and revised the paper on May 29th, 2017. Revised version [BibTeX][PDF][Link]@inproceedings { RTSS2016-resource, author = {Huang, Wen-Hung and Yang, Maolin and Chen, Jian-Jia}, title = {Resource-Oriented Partitioned Scheduling in Multiprocessor Systems: How to Partition and How to Share?}, booktitle = {Real-Time Systems Symposium (RTSS)}, year = {2016}, address = {Porto, Portugal}, month = {Nov. 29 - Dec. 2}, publisher = { Revised version with latexdiff}, note = { (Outstanding Paper Award). We identified some typos and revised the paper on May 29th, 2017. Revised version}, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017-synchronization-revised-diff.pdf}, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-kevin-synchronization_RTSS_camera_ready.pdf}, confidential = {n}, }

Jian-Jia Chen. Computational Complexity and Speedup Factors Analyses for Self-Suspending Tasks. In Real-Time Systems Symposium (RTSS), Porto, Portugal, Nov. 29 - Dec. 2, 2016. [BibTeX][PDF]@inproceedings { RTSS2016-suspension, author = {Chen, Jian-Jia}, title = {Computational Complexity and Speedup Factors Analyses for Self-Suspending Tasks}, booktitle = {Real-Time Systems Symposium (RTSS)}, year = {2016}, address = {Porto, Portugal}, month = {Nov. 29 - Dec. 2}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-JJ-Suspension-Complexity.pdf}, confidential = {n}, }

Jian-Jia Chen. Partitioned Multiprocessor Fixed-Priority Scheduling of Sporadic Real-Time Tasks. In Euromicro Conference on Real-Time Systems (ECRTS), Toulouse, France, 05-08 July 2016. (Outstanding Paper Award) An extended version is available via arXiv: http://arxiv.org/abs/1505.04693 [BibTeX][PDF][Abstract]@inproceedings { ChenECRTS2016-Partition, author = {Chen, Jian-Jia}, title = {Partitioned Multiprocessor Fixed-Priority Scheduling of Sporadic Real-Time Tasks}, booktitle = {Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2016}, address = {Toulouse, France}, month = {05-08, July}, note = { (Outstanding Paper Award) An extended version is available via arXiv: http://arxiv.org/abs/1505.04693}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-chen-ecrts16-partition.pdf}, confidential = {n}, abstract = { Partitioned multiprocessor scheduling has been widely accepted in academia and industry to statically assign and partition real-time tasks onto identical multiprocessor systems. This paper studies fixed-priority partitioned multiprocessor scheduling for sporadic real-time systems, in which deadline-monotonic scheduling is applied on each processor. Prior to this paper, the best known results are by Fisher, Baruah, and Baker with speedup factors $4-\frac{2}{M}$ and $3-\frac{1}{M}$ for arbitrary-deadline and constrained-deadline sporadic real-time task systems, respectively, where $M$ is the number of processors. We show that a greedy mapping strategy has a speedup factor $3-\frac{1}{M}$ when considering task systems with arbitrary deadlines. Such a factor holds for polynomial-time schedulability tests and exponential-time (exact) schedulability tests. Moreover, we also improve the speedup factor to $2.84306$ when considering constrained-deadline task systems. We also provide tight examples when the fitting strategy in the mapping stage is arbitrary and $M$ is sufficiently large. For both constrained- and arbitrary-deadline task systems, the analytical result surprisingly shows that using exact tests does not gain theoretical benefits (with respect to speedup factors) if the speedup factor analysis is oblivious of the particular fitting strategy used.}, }
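The greedy mapping analyzed in the abstract above is, in essence, first-fit partitioning with a per-processor deadline-monotonic schedulability test. The sketch below shows these basic mechanics with an exact time-demand test; the task set is invented, and nothing here reproduces the paper's speedup-factor analysis.

import math

# Hedged sketch: greedy (first-fit) task partitioning where each processor
# is verified with deadline-monotonic priorities and an exact time-demand
# test. Tasks are (C, D, T) tuples with constrained deadlines (D <= T);
# the task set is invented.

def dm_tda_ok(tasks):
    tasks = sorted(tasks, key=lambda t: t[1])        # DM: smaller deadline first
    for i, (c, d, _t) in enumerate(tasks):
        r = c
        while r <= d:
            demand = c + sum(math.ceil(r / th) * ch for ch, _dh, th in tasks[:i])
            if demand == r:
                break                                 # fixed point: response time
            r = demand
        if r > d:
            return False
    return True

def first_fit(tasks, m):
    procs = [[] for _ in range(m)]
    for task in tasks:
        for p in procs:
            if dm_tda_ok(p + [task]):
                p.append(task)
                break
        else:
            return None                               # mapping attempt failed
    return procs

taskset = [(1, 4, 4), (2, 6, 8), (3, 10, 10), (2, 5, 12), (4, 16, 16)]
print(first_fit(taskset, m=2))

The speedup-factor question in the paper is then: by how much must the processors be sped up so that this kind of greedy mapping succeeds whenever an optimal partitioning exists.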
Georg von der Brüggen, Kuan-Hsun Chen, Wen-Hung Huang and Jian-Jia Chen. Systems with Dynamic Real-Time Guarantees in Uncertain and Faulty Execution Environments. In Real-Time Systems Symposium (RTSS), Porto, Portugal, Nov. 29 - Dec. 2, 2016. [BibTeX][PDF]@inproceedings { RTSS2016-dynamic-faulty, author = {Br\"uggen, Georg von der and Chen, Kuan-Hsun and Huang, Wen-Hung and Chen, Jian-Jia}, title = {Systems with Dynamic Real-Time Guarantees in Uncertain and Faulty Execution Environments}, booktitle = {Real-Time Systems Symposium (RTSS)}, year = {2016}, address = {Porto, Portugal}, month = {Nov. 29 - Dec. 2}, keywords = {georg, kevin, kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/mixed_critical.pdf}, confidential = {n}, }

Jian-Jia Chen, Geoffrey Nelissen and Wen-Hung Kevin Huang. A Unifying Response Time Analysis Framework for Dynamic Self-Suspending Tasks. In Euromicro Conference on Real-Time Systems (ECRTS), Toulouse, France, 05-08 July 2016. An extended version is available in technical report #850, Technische Universität Dortmund - Fakultät für Informatik. [BibTeX][PDF][Abstract]@inproceedings { ChenECRTS2016-suspension, author = {Chen, Jian-Jia and Nelissen, Geoffrey and Huang, Wen-Hung Kevin}, title = {A Unifying Response Time Analysis Framework for Dynamic Self-Suspending Tasks}, booktitle = {Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2016}, address = {Toulouse, France}, month = {05-08, July}, note = {An extended version is available in technical report #850, Technische Universit\"at Dortmund - Fakult\"at f\"ur Informatik}, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-chen-ecrts-suspension.pdf}, confidential = {n}, abstract = { For real-time embedded systems, self-suspending behaviors can cause substantial performance/schedulability degradations.
In this paper, we focus on preemptive fixed-priority scheduling for the dynamic self-suspension task model on uniprocessors. This model assumes that a job of a task can dynamically suspend itself during its execution (for instance, to wait for shared resources or to access co-processors or external devices). The total suspension time of a job is upper-bounded, but this dynamic behavior drastically influences the interference generated by this task on lower-priority tasks. The state-of-the-art results for this task model can be classified into three categories: (i) modeling suspension as computation, (ii) modeling suspension as release jitter, and (iii) modeling suspension as a blocking term. However, several results associated with the release jitter approach have recently been proven erroneous, and the concept of modeling suspension as blocking was never formally proven correct. This paper presents a unifying response time analysis framework for the dynamic self-suspending task model. We provide a rigorous proof and show that the existing analyses pertaining to the three categories mentioned above are analytically dominated by our proposed solution. Therefore, all those techniques are in fact correct, but they are inferior to the response time analysis proposed in this paper. The evaluation results show that our analysis framework can generate huge improvements (an increase of up to $50\%$ in the number of task sets deemed schedulable) over these state-of-the-art analyses.}, }
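Of the three categories listed in this abstract, "modeling suspension as computation" is the simplest to state: the suspension bound is folded into the execution time and a standard response-time iteration is run. A minimal sketch with invented parameters follows; this is the simple baseline that the paper's unified framework dominates, not the framework itself.

import math

# Hedged sketch of category (i) above, suspension as computation: every
# task's WCET is inflated by its suspension bound and a standard
# fixed-priority response-time iteration is run. Tasks are (C, S, T) with
# implicit deadlines, listed by decreasing priority; parameters are invented.

def response_time_suspension_oblivious(tasks, i):
    c_i = tasks[i][0] + tasks[i][1]                  # inflate C_i by S_i
    r = c_i
    while r <= tasks[i][2]:
        demand = c_i + sum(math.ceil(r / t) * (c + s) for c, s, t in tasks[:i])
        if demand == r:
            return r
        r = demand
    return None                                       # deemed unschedulable

tasks = [(1, 0.5, 5), (2, 1, 10), (3, 0, 20)]        # (C, S, T)
print([response_time_suspension_oblivious(tasks, i) for i in range(3)])   # [1.5, 4.5, 9]

Treating every suspension as busy time is safe but pessimistic, which is exactly the slack the jitter- and blocking-based analyses, and ultimately the unified framework, try to recover.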
Matthias Freier and Jian-Jia Chen. Sporadic Task Handling in Time-Triggered Systems. In Proceedings of the 19th International Workshop on Software and Compilers for Embedded Systems (SCOPES) 2016, Sankt Goar, Germany, May 23-25, 2016, pages 135--144. [BibTeX][Link]@inproceedings { DBLP:conf/scopes/FreierC16, author = {Freier, Matthias and Chen, Jian-Jia}, title = {Sporadic Task Handling in Time-Triggered Systems}, booktitle = {Proceedings of the 19th International Workshop on Software and Compilers for Embedded Systems, {SCOPES} 2016, Sankt Goar, Germany, May 23-25, 2016}, year = {2016}, pages = {135--144}, url = {http://doi.org/10.1145/2906363.2906383}, confidential = {n}, }

Georg von der Brüggen, Wen-Hung Huang, Jian-Jia Chen and Cong Liu. Uniprocessor Scheduling Strategies for Self-Suspending Task Systems. In Proceedings of the 24th International Conference on Real-Time Networks and Systems (RTNS), pages 119--128, October 2016. [BibTeX][PDF][Link][Abstract]@inproceedings { vonderBruggen:2016:USS:2997465.2997497, author = {Br\"uggen, Georg von der and Huang, Wen-Hung and Chen, Jian-Jia and Liu, Cong}, title = {Uniprocessor Scheduling Strategies for Self-Suspending Task Systems}, booktitle = {Proceedings of the 24th International Conference on Real-Time Networks and Systems (RTNS)}, year = {2016}, pages = {119--128}, month = {October}, publisher = {ACM}, url = {http://dl.acm.org/ft_gateway.cfm?id=2997497\&ftid=1804918\&dwn=1\&CFID=691780547\&CFTOKEN=64912419}, keywords = {georg, kevin}, file = {http://dl.acm.org/ft_gateway.cfm?id=2997497\&ftid=1804918\&dwn=1\&CFID=691780547\&CFTOKEN=64912419}, confidential = {n}, abstract = {(open access)}, }

Anas Toma, Santiago Pagani, Jian-Jia Chen, Wolfgang Karl and Jörg Henkel. An Energy-Efficient Middleware for Computation Offloading in Real-Time Embedded Systems. In the 22nd IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA 2016), Daegu, South Korea, August 2016. [BibTeX][PDF][Abstract]@inproceedings { Toma-RTCSA2016, author = {Toma, Anas and Pagani, Santiago and Chen, Jian-Jia and Karl, Wolfgang and Henkel, J\"org}, title = {An Energy-Efficient Middleware for Computation Offloading in Real-Time Embedded Systems}, booktitle = {the 22nd IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA 2016)}, year = {2016}, address = {Daegu, South Korea}, month = {August}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-toma-rtcsa.pdf}, confidential = {n}, abstract = {Embedded systems have limited resources, such as computation capabilities and battery life. The Dynamic Voltage and Frequency Scaling (DVFS) technique is used to save energy by running the processor of the embedded system at low voltage and frequency levels. However, this prolongs the execution time, which may cause potential deadline misses for real-time tasks. In this paper, we propose a general-purpose middleware to reduce the energy consumption of embedded systems without violating real-time constraints. The algorithms in the middleware adopt the computation offloading concept to reduce the workload on the processor of the embedded system by sending computation-intensive tasks to a powerful server. The algorithms are further combined with the DVFS technique to find the running frequency (or speed) such that the energy consumption is minimized and the real-time constraints are satisfied. The evaluation shows that our approach reduces the average energy consumption down to nearly 60%, compared to executing all the tasks locally at the maximum processor speed.}, }
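The frequency-selection step described in the abstract above can be illustrated with the textbook DVFS convention that execution time scales with 1/f while dynamic power scales roughly with f^3 (so energy per workload with f^2). The sketch below uses only that convention; all numbers and the power model are placeholders, not the paper's algorithm.

# Hedged sketch: choose the lowest frequency that still meets the deadline,
# under the textbook convention t = cycles/f and P ~ k*f^3 (so E ~ k*cycles*f^2).
# All numbers and the constant k are illustrative placeholders.

def min_feasible_frequency(cycles, deadline_s, freqs_hz):
    for f in sorted(freqs_hz):
        if cycles / f <= deadline_s:
            return f
    return None            # no local frequency works: offload or reject

def dynamic_energy(cycles, f_hz, k=1e-27):
    # E = P * t with P ~ k*f^3 and t = cycles/f, hence E ~ k * cycles * f^2
    return k * cycles * f_hz ** 2

freqs = [0.6e9, 1.0e9, 1.4e9, 2.0e9]
f = min_feasible_frequency(2e9, deadline_s=1.5, freqs_hz=freqs)
print(f, dynamic_energy(2e9, f), dynamic_energy(2e9, max(freqs)))

Running at 1.4 GHz instead of 2.0 GHz roughly halves the dynamic energy in this toy model, and offloading shrinks the cycle count itself, which is why combining the two pays off.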
Wen-Hung Huang and Jian-Jia Chen. Response Time Bounds for Sporadic Arbitrary-Deadline Tasks under Global Fixed-Priority Scheduling on Multiprocessors. In International Conference on Real-Time Networks and Systems (RTNS), Lille, France, 4-6th November 2015. [BibTeX][PDF]@inproceedings { CH15, author = {Huang, Wen-Hung and Chen, Jian-Jia}, title = {Response Time Bounds for Sporadic Arbitrary-Deadline Tasks under Global Fixed-Priority Scheduling on Multiprocessors}, booktitle = {International Conference on Real-Time Networks and Systems (RTNS)}, year = {2015}, address = {Lille, France}, month = {4-6th Nov}, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-huang_response-time-bounded-rtns.pdf}, confidential = {n}, }

Wen-Hung Huang, Jian-Jia Chen, Husheng Zhou and Cong Liu. PASS: Priority Assignment of Real-Time Tasks with Dynamic Suspending Behavior under Fixed-Priority Scheduling. In Design Automation Conference (DAC), San Francisco, CA, USA, 2015. [BibTeX][PDF][Abstract]@inproceedings { Wal.15a, author = {Huang, Wen-Hung and Chen, Jian-Jia and Zhou, Husheng and Liu, Cong}, title = {PASS: Priority Assignment of Real-Time Tasks with Dynamic Suspending Behavior under Fixed-Priority Scheduling}, booktitle = {Design Automation Conference (DAC), San Francisco, CA, USA}, year = {2015}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/pass-dac-2015.pdf}, confidential = {n}, abstract = {Self-suspension is becoming an increasingly prominent characteristic in real-time systems such as: (i) I/O-intensive systems, where applications interact intensively with I/O devices, (ii) multi-core processors, where tasks running on different cores have to synchronize and communicate with each other, and (iii) computation offloading systems with coprocessors, like Graphics Processing Units (GPUs). In this paper, we show that rate-monotonic (RM), deadline-monotonic (DM) and laxity-monotonic (LM) scheduling perform rather poorly in dynamic self-suspending systems in terms of speed-up factors. On the other hand, the proposed PASS approach is guaranteed to find a feasible priority assignment on a speed-2 uniprocessor, if one exists on a unit-speed processor.
We evaluate the feasibility of the proposed approach via a case study implementation. Furthermore, the effectiveness of the proposed approach is also shown via extensive simulation results.}, }

Kuan-Hsun Chen, Jian-Jia Chen, Florian Kriebel, Semeen Rehman, Muhammad Shafique and Jörg Henkel. Reliability-Aware Task Mapping on Many-Cores with Performance Heterogeneity. In ESWEEK Workshop on Resiliency in Embedded Electronic Systems, 2015. [BibTeX][PDF][Abstract]@inproceedings { REES2015, author = {Chen, Kuan-Hsun and Chen, Jian-Jia and Kriebel, Florian and Rehman, Semeen and Shafique, Muhammad and Henkel, J}, title = {Reliability-Aware Task Mapping on Many-Cores with Performance Heterogeneity}, booktitle = {ESWEEK Workshop on Resiliency in Embedded Electronic Systems}, year = {2015}, keywords = {kuan}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-kuan-rees.pdf}, confidential = {n}, abstract = {In this paper we explore how to efficiently allocate tasks onto many-cores using RMT to improve the overall dependability, with respect to both timing and functional correctness, while also accounting for application tasks with multiple compiled versions. Such multiple reliable versions can be generated by reliability-aware compilers like [2] and [8], exhibiting diverse performance and reliability properties. By applying multiple reliable task versions and RMT, we are able to exploit the optimization space at both the software and hardware levels while exploring different area, execution time, and reliability trade-offs. Timing correctness can be defined via the deadline miss rate, which is typically adopted as the quality-of-service (QoS) metric in many practical real-time applications.}, }
Matthias Freier and Jian-Jia Chen.Time-Triggered Communication Scheduling Analysis for Real-Time Multicore Systems. In 10th IEEE International Symposium on Industrial Embedded Systems (SIES), Siegen, Germany, June 8-10 2015[BibTeX][PDF][Abstract]@inproceedings { Freier2015, author = {Freier, Matthias and Chen, Jian-Jia}, title = {Time-Triggered Communication Scheduling Analysis for Real-Time Multicore Systems}, booktitle = {10th IEEE International Symposium on Industrial Embedded Systems (SIES)}, year = {2015}, address = {Siegen, Germany}, month = {June 8-10}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-chen-SIES.pdf}, confidential = {n}, abstract = {Scheduling of real-time applications for multicore platforms has become an important research topic. For analyzing the timing satisfaction of real-time tasks, most research in the literature assumes independent tasks. However, industrial applications usually come with fully tangled dependencies among the tasks. Independence of the tasks provides a convenient abstraction, whereas dependent structures due to the tangled executions of the tasks are closer to the real systems. This paper studies the scheduling policies and the schedulability analysis based on independent tasks by hiding the execution dependencies with additional timing parameters. Our scheduling policy relates to the well-known periodic task model, but in contrast, tasks are able to communicate with each other. A feasible task set requires an analysis for each core and the communication infrastructure, which can be performed individually by decoupling computation from communication in a distributed system. By using a Time-Triggered Constant Phase (TTCP) scheduler, each task receives certain time-slots in the hyper-period of the task set, which ensures a time-predictable communication impact. In this paper, we provide several algorithms to derive the time-slot for each task. Further, we found a fast heuristic algorithm to calculate the time-slot for each task, which is capable of reaching a core utilization of 90% for typical industrial applications. Finally, experiments show the effectiveness of our heuristic and the performance in different settings.}, }Scheduling of real-time applications for multicore platforms has become an important research topic. For analyzing the timing satisfaction of real-time tasks, most research in the literature assumes independent tasks. However, industrial applications usually come with fully tangled dependencies among the tasks. Independence of the tasks provides a convenient abstraction, whereas dependent structures due to the tangled executions of the tasks are closer to the real systems. This paper studies the scheduling policies and the schedulability analysis based on independent tasks by hiding the execution dependencies with additional timing parameters. Our scheduling policy relates to the well-known periodic task model, but in contrast, tasks are able to communicate with each other. A feasible task set requires an analysis for each core and the communication infrastructure, which can be performed individually by decoupling computation from communication in a distributed system. By using a Time-Triggered Constant Phase (TTCP) scheduler, each task receives certain time-slots in the hyper-period of the task set, which ensures a time-predictable communication impact. In this paper, we provide several algorithms to derive the time-slot for each task.
Further, we found a fast heuristic algorithm to calculate the time-slot for each task, which is capable of reaching a core utilization of 90% for typical industrial applications. Finally, experiments show the effectiveness of our heuristic and the performance in different settings. Huang Wen-Hung and Jian-Jia Chen.Techniques for Schedulability Analysis in Mode Change Systems under Fixed-Priority Scheduling. In IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), Hong Kong, August 19-21, 2015, (Best Paper Award). We identified some typos in the proofs of Theorems 5 and 6 on May 29th, 2017. Revised version [BibTeX][PDF][Link]@inproceedings { WC15-RTCSA, author = {Wen-Hung, Huang and Chen, Jian-Jia}, title = {Techniques for Schedulability Analysis in Mode Change Systems under Fixed-Priority Scheduling}, booktitle = {IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA)}, year = {2015}, address = {Hong Kong}, month = {August 19-21, 2015}, note = { (Best Paper Award). We identified some typos in the proofs of Theorems 5 and 6 on May 29th, 2017. Revised version }, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-multimode-revised-diff.pdf}, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/polynomial-mode-change.pdf}, confidential = {n}, } Jian-Jia Chen, Wen-Hung Huang and Cong Liu.k2U: A General Framework from k-Point Effective Schedulability Analysis to Utilization-Based Tests. In Real-Time Systems Symposium (RTSS), Dec. 1-4 2015[BibTeX][PDF][Abstract]@inproceedings { ChenHLRTSS2015, author = {Chen, Jian-Jia and Huang, Wen-Hung and Liu, Cong}, title = {k2U: A General Framework from k-Point Effective Schedulability Analysis to Utilization-Based Tests}, booktitle = {Real-Time Systems Symposium (RTSS)}, year = {2015}, month = {Dec. 1-4}, keywords = {kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-chen-RTSS.pdf}, confidential = {n}, abstract = {To deal with a large variety of workloads in different application domains in real-time embedded systems, a number of expressive task models have been developed. For each individual task model, researchers tend to develop different types of techniques for deriving schedulability tests with different computation complexity and performance. In this paper, we present a general schedulability analysis framework, namely the k2U framework, that can be potentially applied to analyze a large set of real-time task models under any fixed-priority scheduling algorithm, on both uniprocessor and multiprocessor scheduling. The key to k2U is a k-point effective schedulability test, which can be viewed as a “blackbox” interface. For any task model, if a corresponding k-point effective schedulability test can be constructed, then a sufficient utilization-based test can be automatically derived. We show the generality of k2U by applying it to different task models, which results in new and improved tests compared to the state-of-the-art.}, }To deal with a large variety of workloads in different application domains in real-time embedded systems, a number of expressive task models have been developed. For each individual task model, researchers tend to develop different types of techniques for deriving schedulability tests with different computation complexity and performance.
In this paper, we present a general schedulability analysis framework, namely the k2U framework, that can be potentially applied to analyze a large set of real-time task models under any fixed-priority scheduling algorithm, on both uniprocessor and multiprocessor scheduling. The key to k2U is a k-point effective schedulability test, which can be viewed as a “blackbox” interface. For any task model, if a corresponding k-point effective schedulability test can be constructed, then a sufficient utilization-based test can be automatically derived. We show the generality of k2U by applying it to different task models, which results in new and improved tests compared to the state-of-the-art. Helena Kotthaus, Ingo Korb and Peter Marwedel.Performance Analysis for Parallel R Programs: Towards Efficient Resource Utilization. In Abstract Booklet of the International R User Conference (UseR!), pages 66, Aalborg, Denmark, June 2015[BibTeX][Link]@inproceedings { kotthaus/2015a, author = {Kotthaus, Helena and Korb, Ingo and Marwedel, Peter}, title = {Performance Analysis for Parallel R Programs: Towards Efficient Resource Utilization}, booktitle = {Abstract Booklet of the International R User Conference (UseR!)}, year = {2015}, pages = {66}, address = {Aalborg, Denmark}, month = {June}, url = {http://user2015.math.aau.dk/docs/useR2015-BookOfAbstracts.pdf}, confidential = {n}, } Helena Kotthaus, Ingo Korb and Peter Marwedel.Distributed Performance Analysis for R. In R Implementation, Optimization and Tooling Workshop (RIOT), Prague, Czech Republic, July 2015[BibTeX][Link]@inproceedings { kotthaus/2015b, author = {Kotthaus, Helena and Korb, Ingo and Marwedel, Peter}, title = {Distributed Performance Analysis for R}, booktitle = {R Implementation, Optimization and Tooling Workshop (RIOT)}, year = {2015}, address = {Prague, Czech Republic}, month = {July}, url = {http://2015.ecoop.org/track/RIOT-2015-papers#program}, confidential = {n}, } Andreas Heinig, Florian Schmoll, Björn Bönninghoff, Peter Marwedel and Michael Engel.FAME: Flexible Real-Time Aware Error Correction by Combining Application Knowledge and Run-Time Information. In Proceedings of the 11th Workshop on Silicon Errors in Logic - System Effects (SELSE) 2015[BibTeX][PDF]@inproceedings { heinig:2015:selse, author = {Heinig, Andreas and Schmoll, Florian and B\"onninghoff, Bj\"orn and Marwedel, Peter and Engel, Michael}, title = {FAME: Flexible Real-Time Aware Error Correction by Combining Application Knowledge and Run-Time Information}, booktitle = {Proceedings of the 11th Workshop on Silicon Errors in Logic - System Effects (SELSE)}, year = {2015}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-heinig-selse2015.pdf}, confidential = {n}, } Georg von der Brüggen, Jian-Jia Chen and Wen-Hung Huang.Schedulability and Optimization Analysis for Non-Preemptive Static Priority Scheduling Based on Task Utilization and Blocking Factors. In Proceedings of Euromicro Conference on Real-Time Systems (ECRTS), Lund, Sweden, July 8-10 2015, We identified an error and revised the paper on Aug. 14th 2015.
Short summary of erratum [BibTeX][PDF][Abstract]@inproceedings { brueggen:2015:ecrts, author = {Br\"uggen, Georg von der and Chen, Jian-Jia and Huang, Wen-Hung}, title = {Schedulability and Optimization Analysis for Non-Preemptive Static Priority Scheduling Based on Task Utilization and Blocking Factors}, booktitle = {Proceedings of Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2015}, address = {Lund, Sweden}, month = {July 8-10}, note = {We identified an error and revised the paper on Aug. 14th 2015. Short summary of erratum }, keywords = {georg, kevin}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015_brueggen_ecrts.pdf}, confidential = {n}, abstract = {For real-time task sets, allowing preemption is often considered to be important to ensure the schedulability, as it allows high-priority tasks to be allocated to the processor nearly immediately. However, preemptive scheduling also introduces some additional overhead and may not be allowed for some hardware components, which motivates the need for non-preemptive or limited-preemptive scheduling. We present a safe sufficient schedulability test for non-preemptive (NP) fixed priority scheduling that can verify the schedulability for Deadline Monotonic (DM-NP) and Rate Monotonic (RM-NP) scheduling in linear time, if task orders according to priority and period are given. This test leads to a better upper bound on the speedup factor for DM-NP and RM-NP in comparison to Earliest Deadline First (EDF-NP) than previously known, closing the gap between the lower and upper bounds. We improve our test, resulting in interesting properties of the blocking time that allow determining schedulability by only considering the schedulability of the preemptive case if some conditions are met. Furthermore, we present a utilization bound for RM-NP, based on the ratio $\gamma > 0$ of the upper bound of the maximum blocking time to the execution time, significantly improving previous results.}, }For real-time task sets, allowing preemption is often considered to be important to ensure the schedulability, as it allows high-priority tasks to be allocated to the processor nearly immediately. However, preemptive scheduling also introduces some additional overhead and may not be allowed for some hardware components, which motivates the need for non-preemptive or limited-preemptive scheduling. We present a safe sufficient schedulability test for non-preemptive (NP) fixed priority scheduling that can verify the schedulability for Deadline Monotonic (DM-NP) and Rate Monotonic (RM-NP) scheduling in linear time, if task orders according to priority and period are given. This test leads to a better upper bound on the speedup factor for DM-NP and RM-NP in comparison to Earliest Deadline First (EDF-NP) than previously known, closing the gap between the lower and upper bounds. We improve our test, resulting in interesting properties of the blocking time that allow determining schedulability by only considering the schedulability of the preemptive case if some conditions are met. Furthermore, we present a utilization bound for RM-NP, based on the ratio $\gamma > 0$ of the upper bound of the maximum blocking time to the execution time, significantly improving previous results. Olaf Neugebauer, Michael Engel and Peter Marwedel.Multi-Objective Aware Communication Optimization for Resource-Restricted Embedded Systems. In Proceedings of Architecture of Computing Systems (ARCS).
2015[BibTeX][PDF][Abstract]@inproceedings { neugebauer:2015:arcs, author = {Neugebauer, Olaf and Engel, Michael and Marwedel, Peter}, title = {Multi-Objective Aware Communication Optimization for Resource-Restricted Embedded Systems}, booktitle = {Proceedings of Architecture of Computing Systems (ARCS)}, year = {2015}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-neugebauer-arcs.pdf}, confidential = {n}, abstract = {Creating efficient parallel software for current embedded multicore systems is a complex and error-prone task. While automatic parallelization tools help to exploit the performance of multicores, most of these systems waste optimization opportunities since they neglect to consider hardware details such as communication performance and memory hierarchies. In addition, most tools do not allow multi-criteria optimization for objectives such as performance and energy. These approaches are especially relevant in the embedded domain. In this paper we present PICO, an approach that enables multi-objective optimization of embedded parallel programs. In combination with a state-of-the-art parallelization approach for sequential C code, PICO uses high-level models and simulators for performance and energy consumption optimization. As a result, PICO generates a set of Pareto-optimal solutions using a genetic algorithm-based optimization. These solutions allow an embedded system designer to choose a parallelization solution which exhibits a suitable trade-off between the required speedup and the resulting energy consumption according to a given system's requirements. Using PICO, we were able to reduce energy consumption by about 35% compared to the sequential execution for a heterogeneous architecture. Further, runtime reductions by roughly 55% were achieved for a benchmark on a homogeneous platform.}, }Creating efficient parallel software for current embedded multicore systems is a complex and error-prone task. While automatic parallelization tools help to exploit the performance of multicores, most of these systems waste optimization opportunities since they neglect to consider hardware details such as communication performance and memory hierarchies. In addition, most tools do not allow multi-criteria optimization for objectives such as performance and energy. These approaches are especially relevant in the embedded domain. In this paper we present PICO, an approach that enables multi-objective optimization of embedded parallel programs. In combination with a state-of-the-art parallelization approach for sequential C code, PICO uses high-level models and simulators for performance and energy consumption optimization. As a result, PICO generates a set of Pareto-optimal solutions using a genetic algorithm-based optimization. These solutions allow an embedded system designer to choose a parallelization solution which exhibits a suitable trade-off between the required speedup and the resulting energy consumption according to a given system's requirements. Using PICO, we were able to reduce energy consumption by about 35% compared to the sequential execution for a heterogeneous architecture. Further, runtime reductions by roughly 55% were achieved for a benchmark on a homogeneous platform. Olaf Neugebauer, Pascal Libuschewski, Michael Engel, Heinrich Mueller and Peter Marwedel.Plasmon-based Virus Detection on Heterogeneous Embedded Systems.
In Proceedings of Workshop on Software & Compilers for Embedded Systems (SCOPES) 2015[BibTeX][PDF][Abstract]@inproceedings { neugebauer2015:scopes, author = {Neugebauer, Olaf and Libuschewski, Pascal and Engel, Michael and Mueller, Heinrich and Marwedel, Peter}, title = {Plasmon-based Virus Detection on Heterogeneous Embedded Systems}, booktitle = {Proceedings of Workshop on Software \& Compilers for Embedded Systems (SCOPES)}, year = {2015}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-neugebauer-scopes.pdf}, confidential = {n}, abstract = {Embedded systems, e.g. in computer vision applications, are expected to provide significant amounts of computing power to process large data volumes. Many of these systems, such as used in medical diagnosis, are mobile devices and face significant challenges to provide sufficient performance while operating on a constrained energy budget. Modern embedded MPSoC platforms use heterogeneous CPU and GPU cores providing a large number of optimization parameters. This allows finding useful trade-offs between energy consumption and performance for a given application. In this paper, we describe how the complex data processing required for PAMONO, a novel type of biosensor for the detection of biological viruses, can efficiently be implemented on a state-of-the-art heterogeneous MPSoC platform. An additional optimization dimension explored is the achieved quality of service. Reducing the virus detection accuracy enables additional optimizations not achievable by modifying hardware or software parameters alone. Instead of relying on often inaccurate simulation models, our design space exploration employs a hardware-in-the-loop approach to evaluate the performance and energy consumption on the embedded target platform. Trade-offs between performance, energy and accuracy are controlled by a genetic algorithm running on a PC control system which deploys the evaluation tasks to a number of connected embedded boards. Using our optimization approach, we are able to achieve frame rates meeting the requirements without losing accuracy. Further, our approach is able to reduce the energy consumption by 93% with a still reasonable detection quality.}, }Embedded systems, e.g. in computer vision applications, are expected to provide significant amounts of computing power to process large data volumes. Many of these systems, such as used in medical diagnosis, are mobile devices and face significant challenges to provide sufficient performance while operating on a constrained energy budget. Modern embedded MPSoC platforms use heterogeneous CPU and GPU cores providing a large number of optimization parameters. This allows finding useful trade-offs between energy consumption and performance for a given application. In this paper, we describe how the complex data processing required for PAMONO, a novel type of biosensor for the detection of biological viruses, can efficiently be implemented on a state-of-the-art heterogeneous MPSoC platform. An additional optimization dimension explored is the achieved quality of service. Reducing the virus detection accuracy enables additional optimizations not achievable by modifying hardware or software parameters alone. Instead of relying on often inaccurate simulation models, our design space exploration employs a hardware-in-the-loop approach to evaluate the performance and energy consumption on the embedded target platform.
Trade-offs between performance, energy and accuracy are controlled by a genetic algorithm running on a PC control system which deploys the evaluation tasks to a number of connected embedded boards. Using our optimization approach, we are able to achieve frame rates meeting the requirements without losing accuracy. Further, our approach is able to reduce the energy consumption by 93% with a still reasonable detection quality. Peter Munk, Matthias Freier, Jan Richling and Jian-Jia Chen.Dynamic Guaranteed Service Communication on Best-Effort Networks-on-Chip. In 23rd Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, {PDP} 2015, Turku, Finland, March 4-6, 2015, pages 353--360 2015[BibTeX][PDF][Abstract]@inproceedings { DBLP:conf/pdp/MunkFRC15, author = {Munk, Peter and Freier, Matthias and Richling, Jan and Chen, Jian-Jia}, title = {Dynamic Guaranteed Service Communication on Best-Effort Networks-on-Chip}, booktitle = {23rd Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, {PDP} 2015, Turku, Finland, March 4-6, 2015}, year = {2015}, pages = {353--360}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-chen-PDP.pdf}, confidential = {n}, abstract = {In order to execute applications under real-time constraints on many-core processors with a Network-on-Chip (NoC), guaranteed service (GS) communication with guaranteed end-to-end latency and bandwidth is required. Several hardware-based solutions for GS communication have been proposed in the literature. However, commercially available many-core processors, e.g., Tilera's TILEPro64 or Adapteva's Epiphany, do not support such features. In this paper, we propose a software solution that allows GS communication on 2D-mesh packet-switching NoCs. Our investigation is based on a hardware model that is applicable to commercially available processors, which include multiple NoCs to separate request and response packets and support only best-effort communication. We prove that a common upper bound of the injection rate for all sources limits the congestion which leads to an upper bound of the worst-case transmission latency (WCTL) for any transmission, i.e., the combination of a request and a response packet. Furthermore, our approach supports arbitrary transmission streams that can be modified at runtime without violating the upper bound of the WCTL, as long as the injection rate is not violated. This enables adaptive features such as task migration or dynamic scheduling policies. Experiments evaluate our solution for different traffic patterns.}, }In order to execute applications under real-time constraints on many-core processors with a Network-on-Chip (NoC), guaranteed service (GS) communication with guaranteed end-to-end latency and bandwidth is required. Several hardware-based solutions for GS communication have been proposed in the literature. However, commercially available many-core processors, e.g., Tilera's TILEPro64 or Adapteva's Epiphany, do not support such features. In this paper, we propose a software solution that allows GS communication on 2D-mesh packet-switching NoCs. Our investigation is based on a hardware model that is applicable to commercially available processors, which include multiple NoCs to separate request and response packets and support only best-effort communication.
We prove that a common upper bound of the injection rate for all sources limits the congestion which leads to an upper bound of the worst-case transmission latency (WCTL) for any transmission, i.e., the combination of a request and a response packet. Furthermore, our approach supports arbitrary transmission streams that can be modified at runtime without violating the upper bound of the WCTL, as long as the injection rate is not violated. This enables adaptive features such as task migration or dynamic scheduling policies. Experiments evaluate our solution for different traffic patterns. Santiago Pagani, Jian-Jia Chen, Muhammad Shafique and Jörg Henkel.MatEx: efficient transient and peak temperature computation for compact thermal models. In Proceedings of the 2015 Design, Automation & Test in Europe Conference & Exhibition, (DATE), pages 1515--1520, Grenoble, France, March 9-13 2015[BibTeX][PDF][Abstract]@inproceedings { DBLP:conf/date/PaganiCSH15, author = {Pagani, Santiago and Chen, Jian-Jia and Shafique, Muhammad and Henkel, J\"org}, title = {MatEx: efficient transient and peak temperature computation for compact thermal models}, booktitle = {Proceedings of the 2015 Design, Automation \& Test in Europe Conference \& Exhibition, (DATE)}, year = {2015}, pages = {1515--1520}, address = {Grenoble, France}, month = {March 9-13}, file = {http://cesweb.itec.kit.edu/~pagani/pubs/Pagani-DATE-2015-MatEx.pdf}, confidential = {n}, abstract = {In many-core systems, run-time scheduling decisions, such as task migration, core activations/deactivations, voltage/frequency scaling, etc., are typically used to optimize resource usage. Such run-time decisions change the power consumption, which can in turn result in transient temperatures much higher than any steady-state scenarios. Therefore, to be thermally safe, it is important to evaluate the transient peaks before making resource management decisions. This paper presents a method for computing these transient peaks in just a few milliseconds, which is suited for run-time usage. This technique works for any compact thermal model consisting of a system of first-order differential equations, for example, RC thermal networks. Instead of using regular numerical methods, our algorithm is based on analytically solving the differential equations using matrix exponentials and linear algebra. This results in a mathematical expression which can easily be analyzed and differentiated to compute the maximum transient temperatures. Moreover, our method can also be used to efficiently compute all transient temperatures for any given time resolution without accuracy losses. We implement our solution as an open-source tool called MatEx. Our experimental evaluations show that the execution time of MatEx for peak temperature computation can be bounded to no more than 2.5 ms for systems with 76 thermal nodes, and to no more than 26.6 ms for systems with 268 thermal nodes, which is three orders of magnitude faster than the state-of-the-art for the same settings.}, }In many-core systems, run-time scheduling decisions, such as task migration, core activations/deactivations, voltage/frequency scaling, etc., are typically used to optimize resource usage. Such run-time decisions change the power consumption, which can in turn result in transient temperatures much higher than any steady-state scenarios. Therefore, to be thermally safe, it is important to evaluate the transient peaks before making resource management decisions.
This paper presents a method for computing these transient peaks in just a few milliseconds, which is suited for run-time usage. This technique works for any compact thermal model consisting of a system of first-order differential equations, for example, RC thermal networks. Instead of using regular numerical methods, our algorithm is based on analytically solving the differential equations using matrix exponentials and linear algebra. This results in a mathematical expression which can easily be analyzed and differentiated to compute the maximum transient temperatures. Moreover, our method can also be used to efficiently compute all transient temperatures for any given time resolution without accuracy losses. We implement our solution as an open-source tool called MatEx. Our experimental evaluations show that the execution time of MatEx for peak temperature computation can be bounded to no more than 2.5 ms for systems with 76 thermal nodes, and to no more than 26.6 ms for systems with 268 thermal nodes, which is three orders of magnitude faster than the state-of-the-art for the same settings. Santiago Pagani, Muhammad Shafique, Heba Khdr, Jian-Jia Chen and Jörg Henkel.seBoost: Selective Boosting for Heterogeneous Manycores. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), Amsterdam, Netherlands, October 4-9 2015[BibTeX][PDF][Abstract]@inproceedings { DBLP:conf/codes/PaganiCSH15, author = {Pagani, Santiago and Shafique, Muhammad and Khdr, Heba and Chen, Jian-Jia and Henkel, J\"org}, title = {seBoost: Selective Boosting for Heterogeneous Manycores}, booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, year = {2015}, address = {Amsterdam, Netherlands}, month = {October 4-9}, file = {http://cesweb.itec.kit.edu/~pagani/pubs/Pagani-DATE-2015-MatEx.pdf}, confidential = {n}, abstract = {Boosting techniques have been widely adopted in commercial multicore and manycore systems, mainly because they provide means to satisfy performance requirement surges, for one or more cores, at run-time. Current boosting techniques select the boosting levels (for boosted cores) and the throttle-down levels (for non-boosted cores) either arbitrarily or through step-wise control approaches. These methods might result in unnecessary performance losses for the non-boosted cores, in short boosting intervals, in failing to satisfy the required performance surges, or in unnecessarily high power and energy consumption. This paper presents an efficient and lightweight run-time boosting technique based on transient temperature estimation, called seBoost. Our technique guarantees meeting the performance requirement surges at run-time, thus maximizing the boosting time with a minimum loss of performance for the non-boosted cores.}, }Boosting techniques have been widely adopted in commercial multicore and manycore systems, mainly because they provide means to satisfy performance requirement surges, for one or more cores, at run-time. Current boosting techniques select the boosting levels (for boosted cores) and the throttle-down levels (for non-boosted cores) either arbitrarily or through step-wise control approaches. These methods might result in unnecessary performance losses for the non-boosted cores, in short boosting intervals, in failing to satisfy the required performance surges, or in unnecessarily high power and energy consumption.
This paper presents an efficient and lightweight run-time boosting technique based on transient temperature estimation, called seBoost. Our technique guarantees meeting the performance requirement surges at run-time, thus maximizing the boosting time with a minimum loss of performance for the non-boosted cores. Jing Li, Jian Jia Chen, Kunal Agrawal, Chenyang Lu, Chris Gill and Abusayeed Saifullah.Analysis of Federated and Global Scheduling for Parallel tasks. In Proceedings of the 26th Euromicro Conference on Real-Time Systems, Madrid, Spain, July 8-11, 2014 2014[BibTeX][Abstract]@inproceedings { Li:ecrts14, author = {Li, Jing and Chen, Jian Jia and Agrawal, Kunal and Lu, Chenyang and Gill, Chris and Saifullah, Abusayeed}, title = {Analysis of Federated and Global Scheduling for Parallel tasks}, booktitle = {Proceedings of the 26th Euromicro Conference on Real-Time Systems, Madrid, Spain, July 8-11, 2014}, year = {2014}, confidential = {n}, abstract = {This paper considers the scheduling of parallel real-time tasks with implicit deadlines. Each parallel task is characterized as a general directed acyclic graph (DAG). We analyze three different real-time scheduling strategies: two well-known algorithms, namely global earliest-deadline-first and global rate-monotonic, and one new algorithm, namely federated scheduling. The federated scheduling algorithm proposed in this paper is a generalization of partitioned scheduling to parallel tasks. In this strategy, each high-utilization task (utilization $\ge 1$) is assigned a set of dedicated cores and the remaining low-utilization tasks share the remaining cores. We prove capacity augmentation bounds for all three schedulers. In particular, we show that if on unit-speed cores, a task set has total utilization of at most $m$ and the critical-path length of each task is smaller than its deadline, then federated scheduling can schedule that task set on $m$ cores of speed 2; G-EDF can schedule it with speed $\frac{3+\sqrt{5}}{2} \approx 2.618$; and G-RM can schedule it with speed $2+\sqrt{3}\approx 3.732$. We also provide lower bounds on the speedup and show that the bounds are tight for federated scheduling and G-EDF when $m$ is sufficiently large.}, }This paper considers the scheduling of parallel real-time tasks with implicit deadlines. Each parallel task is characterized as a general directed acyclic graph (DAG). We analyze three different real-time scheduling strategies: two well-known algorithms, namely global earliest-deadline-first and global rate-monotonic, and one new algorithm, namely federated scheduling. The federated scheduling algorithm proposed in this paper is a generalization of partitioned scheduling to parallel tasks. In this strategy, each high-utilization task (utilization $\ge 1$) is assigned a set of dedicated cores and the remaining low-utilization tasks share the remaining cores. We prove capacity augmentation bounds for all three schedulers. In particular, we show that if on unit-speed cores, a task set has total utilization of at most $m$ and the critical-path length of each task is smaller than its deadline, then federated scheduling can schedule that task set on $m$ cores of speed 2; G-EDF can schedule it with speed $\frac{3+\sqrt{5}}{2} \approx 2.618$; and G-RM can schedule it with speed $2+\sqrt{3}\approx 3.732$. We also provide lower bounds on the speedup and show that the bounds are tight for federated scheduling and G-EDF when $m$ is sufficiently large.
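The federated strategy described in the abstract above lends itself to a compact sketch. The core-allocation rule below, n_i = ceil((C_i - L_i) / (D_i - L_i)) dedicated cores for each high-utilization task, follows the federated scheduling strategy the paper analyzes; the admission test for the low-utilization tasks is deliberately simplified here, and all identifiers are illustrative.

```python
import math

def federated_allocation(tasks, m):
    """tasks: list of (C, L, D) per DAG task, with total work C,
    critical-path length L, and implicit deadline D; m: available cores.
    Returns (dedicated core counts for high-utilization tasks, cores left
    over for low-utilization tasks), or None if this sketch's test fails."""
    dedicated, low_util = {}, []
    for i, (C, L, D) in enumerate(tasks):
        if C / D >= 1.0:
            # High-utilization task: a dedicated cluster of
            # n_i = ceil((C_i - L_i) / (D_i - L_i)) cores.
            if D <= L:
                return None  # the critical path alone cannot meet the deadline
            dedicated[i] = math.ceil((C - L) / (D - L))
        else:
            low_util.append((C, D))
    remaining = m - sum(dedicated.values())
    # Low-utilization tasks are treated as sequential tasks sharing the
    # remaining cores; this total-utilization check simplifies the paper's test.
    if remaining < 0 or sum(C / D for C, D in low_util) > remaining:
        return None
    return dedicated, remaining

# Example: one heavy DAG task (C=10, L=2, D=4) plus two light tasks on m=8 cores.
print(federated_allocation([(10, 2, 4), (2, 1, 10), (3, 1, 10)], m=8))
# -> ({0: 4}, 4): task 0 receives ceil((10-2)/(4-2)) = 4 dedicated cores.
```

The capacity augmentation bound of 2 quoted in the abstract states when such an allocation is guaranteed to exist: if the task set has total utilization at most m and every critical path is shorter than its deadline on unit-speed cores, federated scheduling succeeds on m cores of twice the speed.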
Cong Liu and Jian-Jia Chen.Bursty-Interference Analysis Techniques for Analyzing Complex Real-Time Task Models. In Proceedings of the 35th IEEE Real-Time Systems Symposium (RTSS), Rome, Italy, December 2-5, 2014 2014[BibTeX][Abstract]@inproceedings { Liu:RTSS14a, author = {Liu, Cong and Chen, Jian-Jia}, title = {Bursty-Interference Analysis Techniques for Analyzing Complex Real-Time Task Models}, booktitle = {Proceedings of the 35th IEEE Real-Time Systems Symposium (RTSS), Rome, Italy, December 2-5, 2014}, year = {2014}, confidential = {n}, abstract = {Due to the recent trend towards building complex real-time cyber-physical systems, system designers need to develop and choose expressive formal models for representing such systems, as the model should be adequately expressive such that it can accurately convey the relevant characteristics of the system being modeled. Compared to the classical sporadic task model, there exist a number of real-time task models that are more expressive. However, such models are often complex and thus are rather difficult to analyze efficiently. For this reason, prior analysis methods for dealing with such complex task models are pessimistic. In this paper, a novel analysis technique, namely the bursty-interference analysis, is presented for analyzing two common expressive real-time task models, the general self-suspending task model and the deferrable server task model. This technique is used to derive new uniprocessor utilization-based schedulability tests and rate-monotonic utilization bounds for the two considered task models scheduled under rate-monotonic scheduling. Extensive experiments presented herein show that our proposed tests improve upon prior tests in all scenarios, in many cases by a wide margin. To the best of our knowledge, these are the first analysis techniques that can efficiently handle the general self-suspending and deferrable server task models on uniprocessors.}, }Due to the recent trend towards building complex real-time cyber-physical systems, system designers need to develop and choose expressive formal models for representing such systems, as the model should be adequately expressive such that it can accurately convey the relevant characteristics of the system being modeled. Compared to the classical sporadic task model, there exist a number of real-time task models that are more expressive. However, such models are often complex and thus are rather difficult to analyze efficiently. For this reason, prior analysis methods for dealing with such complex task models are pessimistic. In this paper, a novel analysis technique, namely the bursty-interference analysis, is presented for analyzing two common expressive real-time task models, the general self-suspending task model and the deferrable server task model. This technique is used to derive new uniprocessor utilization-based schedulability tests and rate-monotonic utilization bounds for the two considered task models scheduled under rate-monotonic scheduling. Extensive experiments presented herein show that our proposed tests improve upon prior tests in all scenarios, in many cases by a wide margin. To the best of our knowledge, these are the first analysis techniques that can efficiently handle the general self-suspending and deferrable server task models on uniprocessors. Cong Liu, Jian-Jia Chen, Liang He and Yu Gu.Analysis Techniques for Supporting Harmonic Real-Time Tasks with Suspensions.
In Proceedings of the 26th Euromicro Conference on Real-Time Systems, Madrid, Spain, July 8-11, 2014 2014[BibTeX][Abstract]@inproceedings { Liu:ecrts14, author = {Liu, Cong and Chen, Jian-Jia and He, Liang and Gu, Yu}, title = {Analysis Techniques for Supporting Harmonic Real-Time Tasks with Suspensions}, booktitle = {Proceedings of the 26th Euromicro Conference on Real-Time Systems, Madrid, Spain, July 8-11, 2014}, year = {2014}, confidential = {n}, abstract = {In many real-time systems, tasks may experience suspension delays when they block to access shared resources or interact with external devices such as I/O. It is known that such suspension delays may negatively impact schedulability. Particularly in hard real-time systems, a few negative results exist on analyzing the schedulability of such systems, even for very restricted suspending task models on a uniprocessor. In this paper, we focus on the particular case of hard real-time suspending task systems with harmonic periods, which is a special case of practical relevance. We propose a new uniprocessor suspension-aware analysis technique for supporting such task systems under rate-monotonic scheduling. Our analysis technique is able to achieve only Θ(1) suspension-related utilization loss on a uniprocessor. Based upon this technique, we further propose a partitioning scheme that supports suspending task systems with harmonic periods on multiprocessors. The resulting schedulability test shows that compared to existing schedulability tests designed for ordinary non-suspending task systems, suspensions only result in Θ(m) additional suspension-related utilization loss, where m is the number of processors. Furthermore, experiments presented herein show that both our uniprocessor and multiprocessor schedulability tests improve upon prior approaches by a significant margin.}, }In many real-time systems, tasks may experience suspension delays when they block to access shared resources or interact with external devices such as I/O. It is known that such suspension delays may negatively impact schedulability. Particularly in hard real-time systems, a few negative results exist on analyzing the schedulability of such systems, even for very restricted suspending task models on a uniprocessor. In this paper, we focus on the particular case of hard real-time suspending task systems with harmonic periods, which is a special case of practical relevance. We propose a new uniprocessor suspension-aware analysis technique for supporting such task systems under rate-monotonic scheduling. Our analysis technique is able to achieve only Θ(1) suspension-related utilization loss on a uniprocessor. Based upon this technique, we further propose a partitioning scheme that supports suspending task systems with harmonic periods on multiprocessors. The resulting schedulability test shows that compared to existing schedulability tests designed for ordinary non-suspending task systems, suspensions only result in Θ(m) additional suspension-related utilization loss, where m is the number of processors. Furthermore, experiments presented herein show that both our uniprocessor and multiprocessor schedulability tests improve upon prior approaches by a significant margin. Jian-Jia Chen and Cong Liu.Fixed-Relative-Deadline Scheduling of Hard Real-Time Tasks with Self-Suspensions. In Proceedings of the 35th IEEE Real-Time Systems Symposium (RTSS), Rome, Italy, December 2-5, 2014 2014, We identified a typo in the schedulability test in Theorem 3 on May 13, 2015.
Short summary [BibTeX][Abstract]@inproceedings { Chen:RTSS14a, author = {Chen, Jian-Jia and Liu, Cong}, title = {Fixed-Relative-Deadline Scheduling of Hard Real-Time Tasks with Self-Suspensions}, booktitle = {Proceedings of the 35th IEEE Real-Time Systems Symposium (RTSS), Rome, Italy, December 2-5, 2014}, year = {2014}, note = {We identified a typo in the schedulability test in Theorem 3 on May 13, 2015. Short summary }, confidential = {n}, abstract = {In many real-time systems, tasks may experience self-suspension delays when accessing external devices. The problem of scheduling such self-suspending tasks to meet hard deadlines on a uniprocessor is known to be $\mathcal{NP}$-hard in the strong sense. Current solutions including the common suspension-oblivious approach of treating all suspensions as computation can be quite pessimistic. This paper shows that another category of scheduling algorithms, namely fixed-relative-deadline (FRD) scheduling, may yield better performance than classical schedulers such as EDF and RM, for real-time tasks that may experience one self-suspension during the execution of a task instance. We analyze a simple FRD algorithm, namely EDA, and derive corresponding pseudo-polynomial-time and linear-time schedulability tests. To analyze the quality of EDA and its schedulability tests, we analyze their resource augmentation factors, with respect to the speed-up factor that is needed to ensure the schedulability and feasibility of the resulting schedule. Specifically, the speed-up factor of EDA is $2$ and $3$, when referring to the optimal FRD scheduling and any feasible arbitrary scheduling, respectively. Moreover, the speed-up factor of the proposed linear-time schedulability test is $2.787$ and $4.875$, when referring to the optimal FRD scheduling and any feasible arbitrary scheduling, respectively. Furthermore, extensive experiments presented herein show that our proposed linear-time schedulability test improves upon prior approaches by a significant margin. To the best of our knowledge, for the scheduling of self-suspending tasks, these are the first results of any sort that indicate it might be possible to design good approximation algorithms.}, }In many real-time systems, tasks may experience self-suspension delays when accessing external devices. The problem of scheduling such self-suspending tasks to meet hard deadlines on a uniprocessor is known to be $\mathcal{NP}$-hard in the strong sense. Current solutions including the common suspension-oblivious approach of treating all suspensions as computation can be quite pessimistic. This paper shows that another category of scheduling algorithms, namely fixed-relative-deadline (FRD) scheduling, may yield better performance than classical schedulers such as EDF and RM, for real-time tasks that may experience one self-suspension during the execution of a task instance. We analyze a simple FRD algorithm, namely EDA, and derive corresponding pseudo-polynomial-time and linear-time schedulability tests. To analyze the quality of EDA and its schedulability tests, we analyze their resource augmentation factors, with respect to the speed-up factor that is needed to ensure the schedulability and feasibility of the resulting schedule. Specifically, the speed-up factor of EDA is $2$ and $3$, when referring to the optimal FRD scheduling and any feasible arbitrary scheduling, respectively.
Moreover, the speed-up factor of the proposed linear-time schedulability test is $2.787$ and $4.875$, when referring to the optimal FRD scheduling and any feasible arbitrary scheduling, respectively. Furthermore, extensive experiments presented herein show that our proposed linear-time schedulability test improves upon prior approaches by a significant margin. To the best of our knowledge, for the scheduling of self-suspending tasks, these are the first results of any sort that indicate it might be possible to design good approximation algorithms. Santiago Pagani, Heba Khdr, Waqaas Munawar, Jian-Jia Chen, Muhammad Shafique, Minming Li and Jörg Henkel.{TSP}: Thermal Safe Power - Efficient power budgeting for Many-Core Systems in Dark Silicon. In IEEE/ACM International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS) New Delhi, India, October 2014 2014, Best Paper Award, TSP tool is available here[BibTeX][Abstract]@inproceedings { Pagani-TSP14, author = {Pagani, Santiago and Khdr, Heba and Munawar, Waqaas and Chen, Jian-Jia and Shafique, Muhammad and Li, Minming and Henkel, J\"org}, title = {{TSP}: Thermal Safe Power - Efficient power budgeting for Many-Core Systems in Dark Silicon}, booktitle = {IEEE/ACM International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS) New Delhi, India, October 2014}, year = {2014}, note = {Best Paper Award, TSP tool is available here}, confidential = {n}, abstract = {Chip manufacturers provide the Thermal Design Power (TDP) for a specific chip. The cooling solution is designed to dissipate this power level. But because TDP is not necessarily the maximum power that can be applied, chips are operated with Dynamic Thermal Management (DTM) techniques. To avoid excessive triggers of DTM, usually, system designers also use TDP as power constraint. However, using a single and constant value as power constraint, e.g., TDP, can result in significant performance losses in many-core systems. Having better power budgeting techniques is a major step towards dealing with the dark silicon problem. This paper presents a new power budget concept, called Thermal Safe Power (TSP), which is an abstraction that provides safe power constraint values as a function of the number of simultaneously operating cores. Executing cores at any power consumption below TSP ensures that DTM is not triggered. TSP can be computed offline for the worst cases, or online for a particular mapping of cores. Our simulations show that using TSP as power constraint results in 50.5\% and 14.2\% higher average performance, compared to using constant power budgets (both per-chip and per-core) and a boosting technique, respectively. Moreover, TSP results in dark silicon estimations which are more optimistic than estimations using constant power budgets.}, }Chip manufacturers provide the Thermal Design Power (TDP) for a specific chip. The cooling solution is designed to dissipate this power level. But because TDP is not necessarily the maximum power that can be applied, chips are operated with Dynamic Thermal Management (DTM) techniques. To avoid excessive triggers of DTM, usually, system designers also use TDP as power constraint. However, using a single and constant value as power constraint, e.g., TDP, can result in significant performance losses in many-core systems. Having better power budgeting techniques is a major step towards dealing with the dark silicon problem.
This paper presents a new power budget concept, called Thermal Safe Power (TSP), which is an abstraction that provides safe power constraint values as a function of the number of simultaneously operating cores. Executing cores at any power consumption below TSP ensures that DTM is not triggered. TSP can be computed offline for the worst cases, or online for a particular mapping of cores. Our simulations show that using TSP as power constraint results in 50.5% and 14.2% higher average performance, compared to using constant power budgets (both per-chip and per-core) and a boosting technique, respectively. Moreover, TSP results in dark silicon estimations which are more optimistic than estimations using constant power budgets. Waqaas Munawar, Heba Khdr, Santiago Pagani, Muhammad Shafique, Jian-Jia Chen and Jörg Henkel.Peak Power Management for Scheduling Real-time Tasks on Heterogeneous Many-Core Systems. In The 20th IEEE International Conference on Parallel and Distributed Systems, (ICPADS), Hsinchu, Taiwan, Dec 16-19, 2014 2014[BibTeX][PDF][Abstract]@inproceedings { munawarPeak14, author = {Munawar, Waqaas and Khdr, Heba and Pagani, Santiago and Shafique, Muhammad and Chen, Jian-Jia and Henkel, J{\"o}rg}, title = {Peak Power Management for Scheduling Real-time Tasks on Heterogeneous Many-Core Systems}, booktitle = {The 20th IEEE International Conference on Parallel and Distributed Systems, (ICPADS), Hsinchu, Taiwan, Dec 16-19, 2014}, year = {2014}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-munawar-icpads.pdf}, confidential = {n}, abstract = {The number and diversity of cores in on-chip systems is increasing rapidly. However, due to the Thermal Design Power (TDP) constraint, it is not possible to continuously operate all cores at the same time. Exceeding the TDP constraint may activate the Dynamic Thermal Management (DTM) to ensure thermal stability. Such hardware-based closed-loop safeguards pose a big challenge in using many-core chips for real-time tasks. Managing the worst-case peak power usage of a chip can help toward resolving this issue. We present a scheme to minimize the peak power usage for frame-based and periodic real-time tasks on many-core processors by scheduling the sleep cycles for each active core and introduce the concept of a sufficient test for peak power consumption for task feasibility. We consider both inter-task and inter-core diversity in terms of power usage and present computationally efficient algorithms for peak power minimization for these cases, i.e., a special case of homogeneous tasks on homogeneous cores to the general case of heterogeneous tasks on heterogeneous cores. We evaluate our solution through extensive simulations using the 48-core SCC platform and gem5 architecture simulator. Our simulation results show the efficacy of our scheme.}, }The number and diversity of cores in on-chip systems is increasing rapidly. However, due to the Thermal Design Power (TDP) constraint, it is not possible to continuously operate all cores at the same time. Exceeding the TDP constraint may activate the Dynamic Thermal Management (DTM) to ensure thermal stability. Such hardware-based closed-loop safeguards pose a big challenge in using many-core chips for real-time tasks. Managing the worst-case peak power usage of a chip can help toward resolving this issue.
We present a scheme to minimize the peak power usage for frame-based and periodic real-time tasks on many-core processors by scheduling the sleep cycles for each active core and introduce the concept of a sufficient test for peak power consumption for task feasibility. We consider both inter-task and inter-core diversity in terms of power usage and present computationally efficient algorithms for peak power minimization for these cases, i.e., a special case of homogeneous tasks on homogeneous cores to the general case of heterogeneous tasks on heterogeneous cores. We evaluate our solution through extensive simulations using the 48-core SCC platform and gem5 architecture simulator. Our simulation results show the efficacy of our scheme. Anas Toma, Jian-Jia Chen and Wei Liu.Computation Offloading for Sporadic Real-Time Tasks. In 20th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), Chongqing, China, August 2014 2014[BibTeX][PDF][Abstract]@inproceedings { TomaCL-RTCSA14, author = {Toma, Anas and Chen, Jian-Jia and Liu, Wei}, title = {Computation Offloading for Sporadic Real-Time Tasks}, booktitle = {20th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA), Chongqing, China, August 2014}, year = {2014}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-toma-rtcsa.pdf}, confidential = {n}, abstract = {The applications of mobile devices are becoming increasingly sophisticated. They include computation-intensive tasks, such as video and audio processing. However, mobile devices have limited resources, which may make it difficult to finish these tasks in time. Computation offloading can be used to boost the capabilities of these resource-constrained devices, where the computation-intensive tasks are moved to a powerful remote processing unit. This paper considers the computation offloading problem for sporadic real-time tasks. The total bandwidth server (TBS) is adopted on the remote processing unit (the server side) for resource reservation. On the client side, a dynamic programming algorithm is proposed to determine the offloading decision of the tasks such that their schedule is feasible (i.e., all the tasks meet their deadlines). The algorithm is evaluated using a case study of a surveillance system and synthesized benchmarks.}, }The applications of mobile devices are becoming increasingly sophisticated. They include computation-intensive tasks, such as video and audio processing. However, mobile devices have limited resources, which may make it difficult to finish these tasks in time. Computation offloading can be used to boost the capabilities of these resource-constrained devices, where the computation-intensive tasks are moved to a powerful remote processing unit. This paper considers the computation offloading problem for sporadic real-time tasks. The total bandwidth server (TBS) is adopted on the remote processing unit (the server side) for resource reservation. On the client side, a dynamic programming algorithm is proposed to determine the offloading decision of the tasks such that their schedule is feasible (i.e., all the tasks meet their deadlines). The algorithm is evaluated using a case study of a surveillance system and synthesized benchmarks. Helena Kotthaus, Ingo Korb, Markus Künne and Peter Marwedel.Performance Analysis for R: Towards a Faster R Interpreter.
In Abstract Booklet of the International R User Conference (UseR!), pages 104, Los Angeles, USA, July 2014[BibTeX][Link]@inproceedings { kotthaus/2014b, author = {Kotthaus, Helena and Korb, Ingo and K\"unne, Markus and Marwedel, Peter}, title = {Performance Analysis for R: Towards a Faster R Interpreter}, booktitle = {Abstract Booklet of the International R User Conference (UseR!)}, year = {2014}, pages = {104}, address = {Los Angeles, USA}, month = {jul}, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/kotthaus_user2014.pdf}, confidential = {n}, } Helena Kotthaus, Ingo Korb, Michael Engel and Peter Marwedel.Dynamic Page Sharing Optimization for the R Language. In Proceedings of the 10th Symposium on Dynamic Languages, pages 79--90, Portland, Oregon, USA, October 2014[BibTeX][PDF][Link][Abstract]@inproceedings { kotthaus/2014e, author = {Kotthaus, Helena and Korb, Ingo and Engel, Michael and Marwedel, Peter}, title = {Dynamic Page Sharing Optimization for the R Language}, booktitle = {Proceedings of the 10th Symposium on Dynamic Languages}, year = {2014}, series = {DLS '14}, pages = {79--90}, address = {Portland, Oregon, USA}, month = {oct}, publisher = {ACM}, url = {http://dl.acm.org/citation.cfm?id=2661094}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014e_kotthaus.pdf}, confidential = {n}, abstract = {Dynamic languages such as R are increasingly used to process large data sets. Here, the R interpreter induces a large memory overhead due to wasteful memory allocation policies. If an application's working set exceeds the available physical memory, the OS starts to swap, resulting in slowdowns of several orders of magnitude. Thus, memory optimizations for R will be beneficial to many applications. Existing R optimizations are mostly based on dynamic compilation or native libraries. Both methods are futile when the OS starts to page out memory. So far, only a few data-type- or application-specific memory optimizations for R exist. To remedy this situation, we present a low-overhead page sharing approach for R that significantly reduces the interpreter's memory overhead. Concentrating on the most rewarding optimizations avoids the high runtime overhead of existing generic approaches for memory deduplication or compression. In addition, by applying knowledge of interpreter data structures and memory allocation patterns, our approach is not constrained to specific R applications and is transparent to the R interpreter. Our page sharing optimization enables us to reduce the memory consumption by up to 53.5% with an average of 18.0% for a set of real-world R benchmarks with a runtime overhead of only 5.3% on average. In cases where page I/O can be avoided, significant speedups are achieved. }, }Dynamic languages such as R are increasingly used to process large data sets. Here, the R interpreter induces a large memory overhead due to wasteful memory allocation policies. If an application's working set exceeds the available physical memory, the OS starts to swap, resulting in slowdowns of several orders of magnitude. Thus, memory optimizations for R will be beneficial to many applications. Existing R optimizations are mostly based on dynamic compilation or native libraries. Both methods are futile when the OS starts to page out memory. So far, only a few data-type- or application-specific memory optimizations for R exist.
To remedy this situation, we present a low-overhead page sharing approach for R that significantly reduces the interpreter's memory overhead. Concentrating on the most rewarding optimizations avoids the high runtime overhead of existing generic approaches for memory deduplication or compression. In addition, by applying knowledge of interpreter data structures and memory allocation patterns, our approach is not constrained to specific R applications and is transparent to the R interpreter. Our page sharing optimization enables us to reduce the memory consumption by up to 53.5% with an average of 18.0% for a set of real-world R benchmarks with a runtime overhead of only 5.3% on average. In cases where page I/O can be avoided, significant speedups are achieved. Chen-Wei Huang, Timon Kelter, Bjoern Boenninghoff, Jan Kleinsorge, Michael Engel, Peter Marwedel and Shiao-Li Tsao.Static WCET Analysis of the H.264/AVC Decoder Exploiting Coding Information. In International Conference on Embedded and Real-Time Computing Systems and ApplicationsChongqing, China, August 2014[BibTeX]@inproceedings { huang:2014:rtcsa, author = {Huang, Chen-Wei and Kelter, Timon and Boenninghoff, Bjoern and Kleinsorge, Jan and Engel, Michael and Marwedel, Peter and Tsao, Shiao-Li}, title = {Static WCET Analysis of the H.264/AVC Decoder Exploiting Coding Information}, booktitle = {International Conference on Embedded and Real-Time Computing Systems and Applications}, year = {2014}, address = {Chongqing, China}, month = {August}, organization = {IEEE}, keywords = {wcet}, confidential = {n}, } Andreas Heinig, Florian Schmoll, Peter Marwedel and Michael Engel.Who's Using that Memory? A Subscriber Model for Mapping Errors to Tasks. In Proceedings of the 10th Workshop on Silicon Errors in Logic - System Effects (SELSE)Stanford, CA, USA, April 2014[BibTeX][PDF][Abstract]@inproceedings { heinig:2014:SELSE, author = {Heinig, Andreas and Schmoll, Florian and Marwedel, Peter and Engel, Michael}, title = {Who's Using that Memory? A Subscriber Model for Mapping Errors to Tasks}, booktitle = {Proceedings of the 10th Workshop on Silicon Errors in Logic - System Effects (SELSE)}, year = {2014}, address = {Stanford, CA, USA}, month = {April}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-heinig-selse2014.pdf}, confidential = {n}, abstract = {In order to assess the robustness of software-based fault-tolerance methods, extensive tests have to be performed that inject faults, such as bit flips, into hardware components of a running system. Fault injection commonly uses either system simulations, resulting in execution times orders of magnitude longer than on real systems, or exposes a real system to error sources like radiation. This can take place in real time, but it enables only a very coarse-grained control over the affected system component. A solution combining the best characteristics from both approaches should achieve precise fault injection in real hardware systems. The approach presented in this paper uses the JTAG background debug facility of a CPU to inject faults into main memory and registers of a running system. Compared to similar earlier approaches, our solution is able to achieve rapid fault injection using a low-cost microcontroller instead of a complex FPGA. Consequently, our injection software is much more flexible. 
It allows to restrict error injection to the execution of a set of predefined components, resulting in a more precise control of the injection, and also emulates error reporting, which enables the evaluation of different error detection approaches in addition to robustness evaluation. }, }In order to assess the robustness of software-based fault-tolerance methods, extensive tests have to be performed that inject faults, such as bit flips, into hardware components of a running system. Fault injection commonly uses either system simulations, resulting in execution times orders of magnitude longer than on real systems, or exposes a real system to error sources like radiation. This can take place in real time, but it enables only a very coarse-grained control over the affected system component. A solution combining the best characteristics from both approaches should achieve precise fault injection in real hardware systems. The approach presented in this paper uses the JTAG background debug facility of a CPU to inject faults into main memory and registers of a running system. Compared to similar earlier approaches, our solution is able to achieve rapid fault injection using a low-cost microcontroller instead of a complex FPGA. Consequently, our injection software is much more flexible. It allows to restrict error injection to the execution of a set of predefined components, resulting in a more precise control of the injection, and also emulates error reporting, which enables the evaluation of different error detection approaches in addition to robustness evaluation. Timon Kelter and Peter Marwedel.Parallelism Analysis: Precise WCET Values for Complex Multi-Core Systems. In Third International Workshop on Formal Techniques for Safety-Critical SystemsLuxembourg, November 2014[BibTeX][PDF][Link]@inproceedings { kelter:2014:ftscs, author = {Kelter, Timon and Marwedel, Peter}, title = {Parallelism Analysis: Precise WCET Values for Complex Multi-Core Systems}, booktitle = {Third International Workshop on Formal Techniques for Safety-Critical Systems}, year = {2014}, editor = {Cyrille Artho and Peter \"Olveczky}, series = {FTSCS}, address = {Luxembourg}, month = {November}, publisher = {Springer}, url = {http://www.ftscs.org/index.php?n=Main.Home}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-kelter-ftscs.pdf}, confidential = {n}, } Timon Kelter, Peter Marwedel and Hendrik Borghorst.WCET-aware Scheduling Optimizations for Multi-Core Real-Time Systems. In International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS), pages 67-74Samos, Greece, July 2014[BibTeX][PDF]@inproceedings { kelter:2014:samos, author = {Kelter, Timon and Marwedel, Peter and Borghorst, Hendrik}, title = {WCET-aware Scheduling Optimizations for Multi-Core Real-Time Systems}, booktitle = {International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS)}, year = {2014}, pages = {67-74}, address = {Samos, Greece}, month = {July}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-samos.pdf}, confidential = {n}, } Bjoern Dusza, Peter Marwedel, Olaf Spinczyk and Christian Wietfeld.A Context-Aware Battery Lifetime Model for Carrier Aggregation Enabled LTE-A Systems. 
In IEEE Consumer Communications and Networking Conference, Las Vegas, USA, January 2014[BibTeX][Abstract]@inproceedings { marwedel:2014:ccnc, author = {Dusza, Bjoern and Marwedel, Peter and Spinczyk, Olaf and Wietfeld, Christian}, title = {A Context-Aware Battery Lifetime Model for Carrier Aggregation Enabled LTE-A Systems}, booktitle = {IEEE Consumer Communications and Networking Conference}, year = {2014}, series = {CCNC}, address = {Las Vegas, USA}, month = {January}, organization = {IEEE}, keywords = {energy}, confidential = {n}, abstract = {A Quality of Experience (QoE) parameter of increasing importance is the time that a battery powered communication device (e.g. smartphone) can be operated before it needs to be recharged. However, due to the fact that battery capacity is not evolving as fast as the power requirement, the battery lifetime of modern user equipment is stagnating or even decreasing from one device generation to another. In parallel, a major challenge for the design of next generation wireless systems such as LTE-Advanced (LTE-A) is that the required high portion of spectrum is not available in a consecutive portion. For this reason, a procedure called interband non-continuous Carrier Aggregation (CA) will be introduced in LTE-A which allows for the combination of multiple spectrum pieces from different frequency bands. This procedure however requires the parallel operation of multiple power amplifiers that are characterized by a high energy demand. In this paper, we quantify the impact of CA on the power consumption of LTE-A enabled communication by means of a Markovian based power consumption model that incorporates system parameters as well as context parameters. The results show that the suitability of CA, from a battery lifetime perspective, strongly depends upon the actual device characteristics as well as the resource availability in the various frequency bands. Furthermore, the application of the sophisticated Kinetic Battery Model (KiBaM) shows that the charge recovery effect during idle periods does significantly affect the battery lifetime.}, }A Quality of Experience (QoE) parameter of increasing importance is the time that a battery powered communication device (e.g. smartphone) can be operated before it needs to be recharged. However, due to the fact that battery capacity is not evolving as fast as the power requirement, the battery lifetime of modern user equipment is stagnating or even decreasing from one device generation to another. In parallel, a major challenge for the design of next generation wireless systems such as LTE-Advanced (LTE-A) is that the required high portion of spectrum is not available in a consecutive portion. For this reason, a procedure called interband non-continuous Carrier Aggregation (CA) will be introduced in LTE-A which allows for the combination of multiple spectrum pieces from different frequency bands. This procedure however requires the parallel operation of multiple power amplifiers that are characterized by a high energy demand. In this paper, we quantify the impact of CA on the power consumption of LTE-A enabled communication by means of a Markovian based power consumption model that incorporates system parameters as well as context parameters. The results show that the suitability of CA, from a battery lifetime perspective, strongly depends upon the actual device characteristics as well as the resource availability in the various frequency bands. Furthermore, the application of the sophisticated Kinetic Battery Model (KiBaM) shows that the charge recovery effect during idle periods does significantly affect the battery lifetime.
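The KiBaM referenced at the end of this abstract models the battery as two charge wells: charge is drawn from an available well, and a slow diffusion flow from a bound well replenishes it during idle periods, which is exactly the recovery effect the entry reports. A minimal Euler-step sketch; the parameter names (c: capacity fraction of the available well, k: diffusion rate) and the discretization are illustrative assumptions:

```python
def kibam_step(y_avail, y_bound, load, c=0.5, k=1e-4, dt=1.0):
    """One Euler step of the two-well Kinetic Battery Model.
    y_avail: charge the device can draw immediately; y_bound: chemically
    bound charge; load: discharge current. The battery is considered
    empty once y_avail reaches 0."""
    h_avail, h_bound = y_avail / c, y_bound / (1.0 - c)  # well heights
    flow = k * (h_bound - h_avail)       # diffusion from the bound well
    y_avail += (flow - load) * dt        # idle periods (load == 0) let y_avail recover
    y_bound -= flow * dt
    return max(y_avail, 0.0), max(y_bound, 0.0)
```

Iterating this step over a load trace until y_avail hits zero gives a lifetime estimate that, unlike a plain coulomb counter, rewards idle gaps between transmissions.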
Peter Marwedel and Michael Engel.Flipped classroom teaching for a cyber-physical system course - an adequate presence-based learning approach in the internet age. In Proceedings of the Tenth European Workshop on Microelectronics Education (EWME), Tallinn, Estonia, May 2014[BibTeX][PDF][Abstract]@inproceedings { marwedel:2014:ewme, author = {Marwedel, Peter and Engel, Michael}, title = {Flipped classroom teaching for a cyber-physical system course - an adequate presence-based learning approach in the internet age}, booktitle = {Proceedings of the Tenth European Workshop on Microelectronics Education (EWME)}, year = {2014}, address = {Tallinn, Estonia}, month = {May}, publisher = {IEEE}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-ewme.pdf}, confidential = {n}, abstract = {In the age of the Internet, teaching styles need to take new ways of learning into account. This paper recommends the use of the flipped classroom approach. In this approach, the roles of work at home and in class are essentially swapped. We present a case study covering a course on cyber-physical system fundamentals. Results are strongly encouraging us to continue along these lines. We are also commenting on general advantages and limitations of this style of teaching.}, }In the age of the Internet, teaching styles need to take new ways of learning into account. This paper recommends the use of the flipped classroom approach. In this approach, the roles of work at home and in class are essentially swapped. We present a case study covering a course on cyber-physical system fundamentals. Results are strongly encouraging us to continue along these lines. We are also commenting on general advantages and limitations of this style of teaching. Dominic Siedhoff and Heinrich Müller.Signal/Background Classification of Time Series for Biological Virus Detection. In Pattern Recognition - 36th German Conference, GCPR 2014, Münster, Germany, September 2-5, 2014. Proceedings 2014[BibTeX]@inproceedings { Siedhoff/etal/2014b, author = {Siedhoff, Dominic and M\"uller, Heinrich}, title = {Signal/Background Classification of Time Series for Biological Virus Detection}, booktitle = {Pattern Recognition - 36th German Conference, GCPR 2014, M\"unster, Germany, September 2-5, 2014. Proceedings}, year = {2014}, editor = {Xiaoyi Jiang, Joachim Hornegger, Reinhard Koch}, publisher = {Springer}, confidential = {n}, } Olaf Neugebauer, Michael Engel and Peter Marwedel.A Parallelization Approach for Resource Restricted Embedded Heterogeneous MPSoCs Inspired by OpenMP. In Proceedings of Software Engineering for Parallel Systems (SEPS) 2014[BibTeX]@inproceedings { neugebauer:2014:seps, author = {Neugebauer, Olaf and Engel, Michael and Marwedel, Peter}, title = {A Parallelization Approach for Resource Restricted Embedded Heterogeneous MPSoCs Inspired by OpenMP}, booktitle = {Proceedings of Software Engineering for Parallel Systems (SEPS)}, year = {2014}, confidential = {n}, } Jan Kleinsorge and Peter Marwedel.Computing Maximum Blocking Times with Explicit Path Analysis under Non-local Flow Bounds.
In Proceedings of the International Conference on Embedded Software (EMSOFT 2014)New Delhi, India, October 2014[BibTeX][Link]@inproceedings { Kleinsorge:2014:EMSOFT, author = {Kleinsorge, Jan and Marwedel, Peter}, title = {Computing Maximum Blocking Times with Explicit Path Analysis under Non-local Flow Bounds}, booktitle = {Proceedings of the International Conference on Embedded Software (EMSOFT 2014)}, year = {2014}, series = {EMSOFT 2014}, address = {New Delhi, India}, month = {oct}, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-jk.pdf}, confidential = {n}, } Wei Liu, Jian-Jia Chen, Anas Toma, Tei-Wei Kuo and Qingxu Deng.Computation Offloading by Using Timing Unreliable Components in Real-Time Systems. In Design Automation Conference (DAC), San Francisco, CA, USA, June 1-5 2014[BibTeX][PDF][Link][Abstract]@inproceedings { DBLP:conf/dac/LiuCTKD14, author = {Liu, Wei and Chen, Jian-Jia and Toma, Anas and Kuo, Tei-Wei and Deng, Qingxu}, title = {Computation Offloading by Using Timing Unreliable Components in Real-Time Systems}, booktitle = {Design Automation Conference (DAC), San Francisco, CA, USA, June 1-5}, year = {2014}, url = {http://doi.acm.org/10.1145/2593069.2593109}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-liu-dac.pdf}, confidential = {n}, abstract = {There are many timing unreliable computing components in modern computer systems, which are typically forbidden in hard real-time systems due to the timing uncertainty. In this paper, we propose a computation offloading mechanism to utilise these timing unreliable components in a hard real-time system, by providing local compensations. The key of the mechanism is to decide (1) how the unreliable components are utilized and (2) how to set the worst-case estimated response time. The local compensation has to start when the unreliable components do not deliver the results in the estimated response time. We propose a scheduling algorithm and its schedulability test to analyze the feasibility of the compensation mechanism. To validate the proposed mechanism, we perform a case study based on image-processing applications in a robot system and simulations. By adopting the timing unreliable components, the system can handle higher-quality images and with better performance.}, }There are many timing unreliable computing components in modern computer systems, which are typically forbidden in hard real-time systems due to the timing uncertainty. In this paper, we propose a computation offloading mechanism to utilise these timing unreliable components in a hard real-time system, by providing local compensations. The key of the mechanism is to decide (1) how the unreliable components are utilized and (2) how to set the worst-case estimated response time. The local compensation has to start when the unreliable components do not deliver the results in the estimated response time. We propose a scheduling algorithm and its schedulability test to analyze the feasibility of the compensation mechanism. To validate the proposed mechanism, we perform a case study based on image-processing applications in a robot system and simulations. By adopting the timing unreliable components, the system can handle higher-quality images and with better performance. Pascal Libuschewski, Dennis Kaulbars, Dominic Siedhoff, Frank Weichert, Heinrich Müller, Christian Wietfeld and Peter Marwedel.Multi-Objective Computation Offloading for Mobile Biosensors via LTE. 
In Wireless Mobile Communication and Healthcare (Mobihealth), 2014 EAI 4th International Conference on December 2014[BibTeX][PDF][Link][Abstract]@inproceedings { Libuschewski/etal/2014a, author = {Libuschewski, Pascal and Kaulbars, Dennis and Siedhoff, Dominic and Weichert, Frank and M\"uller, Heinrich and Wietfeld, Christian and Marwedel, Peter}, title = {Multi-Objective Computation Offloading for Mobile Biosensors via LTE}, booktitle = {Wireless Mobile Communication and Healthcare (Mobihealth), 2014 EAI 4th International Conference on}, year = {2014}, month = {Dec}, url = {http://dx.doi.org/10.4108/icst.mobihealth.2014.257374}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-mobihealth.pdf}, confidential = {n}, abstract = {For a rapid identification of viral epidemics a mobile virus detection is needed, which can process samples without a laboratory. The application of medical biosensors, at key positions with a high passenger volume (e.g. airports), has become increasingly meaningful as epidemic early warning systems. As mobile biosensors have to fulfill various demands, like rapid analysis and prolonged battery lifetime, we present in this study a multi-objective computation offloading for mobile sensors. The decision whether it is beneficial to offload work to a server can be made automatically on the basis of contrary objectives and several constraints.}, }For a rapid identification of viral epidemics a mobile virus detection is needed, which can process samples without a laboratory. The application of medical biosensors, at key positions with a high passenger volume (e.g. airports), has become increasingly meaningful as epidemic early warning systems. As mobile biosensors have to fulfill various demands, like rapid analysis and prolonged battery lifetime, we present in this study a multi-objective computation offloading for mobile sensors. The decision whether it is beneficial to offload work to a server can be made automatically on the basis of contrary objectives and several constraints. Pascal Libuschewski, Peter Marwedel, Dominic Siedhoff and Heinrich Müller.Multi-Objective Energy-Aware GPGPU Design Space Exploration for Medical or Industrial Applications. In Signal-Image Technology and Internet-Based Systems (SITIS), 2014 Tenth International Conference on, pages 637-644 November 2014, doi 10.1109/SITIS.2014.11[BibTeX][PDF][Link][Abstract]@inproceedings { Libuschewski/etal/2014b, author = {Libuschewski, Pascal and Marwedel, Peter and Siedhoff, Dominic and M\"uller, Heinrich}, title = {Multi-Objective Energy-Aware GPGPU Design Space Exploration for Medical or Industrial Applications}, booktitle = {Signal-Image Technology and Internet-Based Systems (SITIS), 2014 Tenth International Conference on}, year = {2014}, pages = {637-644}, month = {Nov}, publisher = {IEEE Computer Society}, note = {doi 10.1109/SITIS.2014.11}, url = {http://dx.doi.org/10.1109/SITIS.2014.11}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-sitis.pdf}, confidential = {n}, abstract = {This work presents a multi-objective design space exploration for Graphics Processing Units (GPUs). For any given GPGPU application, a Pareto front of best suited GPUs can be calculated. The objectives can be chosen according to the demands of the system, for example energy efficiency, run time and real-time capability. The simulated GPUs can be desktop, high performance or mobile versions. Also GPUs that do not yet exist can be modeled and simulated.
The main application area for the presented approach is the identification of suitable GPU hardware for given medical or industrial applications, e.g. for real-time process control or in healthcare sensor environments. As use case a real-time capable medical biosensor program for an automatic detection of pathogens and a wide variety of industrial, biological and physical applications were evaluated.}, }This work presents a multi-objective design space exploration for Graphics Processing Units (GPUs). For any given GPGPU application, a Pareto front of best suited GPUs can be calculated. The objectives can be chosen according to the demands of the system, for example energy efficiency, run time and real-time capability. The simulated GPUs can be desktop, high performance or mobile versions. Also GPUs that do not yet exist can be modeled and simulated. The main application area for the presented approach is the identification of suitable GPU hardware for given medical or industrial applications, e.g. for real-time process control or in healthcare sensor environments. As use case a real-time capable medical biosensor program for an automatic detection of pathogens and a wide variety of industrial, biological and physical applications were evaluated. Yu-Ming Chang, Yuan-Hao Chang, Jian-Jia Chen, Tei-Wei Kuo, Hsiang-Pang Li and Hang-Ting Lue.On Trading Wear-leveling with Heal-leveling. In Design Automation Conference (DAC), San Francisco, CA, USA, June 1-5 2014, Best Paper Candidate[BibTeX][Link][Abstract]@inproceedings { DBLP:conf/dac/ChangCCKLL14, author = {Chang, Yu-Ming and Chang, Yuan-Hao and Chen, Jian-Jia and Kuo, Tei-Wei and Li, Hsiang-Pang and Lue, Hang-Ting}, title = {On Trading Wear-leveling with Heal-leveling}, booktitle = {Design Automation Conference (DAC), San Francisco, CA, USA, June 1-5}, year = {2014}, note = {Best Paper Candidate}, url = {http://doi.acm.org/10.1145/2593069.2593172}, confidential = {n}, abstract = {Manufacturers are constantly seeking to increase flash memory density in order to fulfill the ever growing demand for storage capacity. However, this trend significantly reduces the reliability and endurance of flash memory chips. The lifetime degradation worsens as the number of erase cycles grows, even with wear leveling technology being adopted to extend flash memory lifetime by evenly distributing erase cycles to every flash block. To address this issue, self-healing technology is proposed to recover a flash block before the flash block is worn out, but such a technology still has its limitation when recovering flash blocks. In contrast to the existing wear leveling designs, we adopt the self-healing technology to propose a heal-leveling design that evenly distributes healing cycles to flash blocks. Ultimately, heal-leveling aims to extend the lifetime of flash memory without introducing a large amount of live-data copying overheads. We conducted a series of experiments to evaluate the capability of the proposed design. The results show that our design can significantly improve the access performance and the effective lifetime of flash memory without the unnecessary overheads caused by wear leveling technology.}, }Manufacturers are constantly seeking to increase flash memory density in order to fulfill the ever growing demand for storage capacity. However, this trend significantly reduces the reliability and endurance of flash memory chips. 
The lifetime degradation worsens as the number of erase cycles grows, even with wear leveling technology being adopted to extend flash memory lifetime by evenly distributing erase cycles to every flash block. To address this issue, self-healing technology is proposed to recover a flash block before the flash block is worn out, but such a technology still has its limitation when recovering flash blocks. In contrast to the existing wear leveling designs, we adopt the self-healing technology to propose a heal-leveling design that evenly distributes healing cycles to flash blocks. Ultimately, heal-leveling aims to extend the lifetime of flash memory without introducing a large amount of live-data copying overheads. We conducted a series of experiments to evaluate the capability of the proposed design. The results show that our design can significantly improve the access performance and the effective lifetime of flash memory without the unnecessary overheads caused by wear leveling technology.
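The contrast drawn in this entry fits in a few lines: wear leveling steers the next write toward the least-worn block, whereas heal-leveling spends the limited healing operations so that healing cycles stay evenly distributed across blocks. A deliberately simplified sketch; the selection policies below are illustrative and not taken from the paper:

```python
def pick_block_to_write(erase_counts):
    # Wear leveling: direct the next write/erase to the least-erased block,
    # which evens out erase cycles but forces live-data copying over time.
    return min(range(len(erase_counts)), key=lambda b: erase_counts[b])

def pick_block_to_heal(heal_counts, erase_counts):
    # Heal-leveling: heal the block with the fewest heals so far, breaking
    # ties in favor of the most-worn block, so healing cycles stay even
    # without moving any live data.
    return min(range(len(heal_counts)),
               key=lambda b: (heal_counts[b], -erase_counts[b]))
```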
Jian-Jia Chen, Mong-Jen Kao, D. T. Lee, Ignaz Rutter and Dorothea Wagner.Online Dynamic Power Management with Hard Real-Time Guarantees. In 31st International Symposium on Theoretical Aspects of Computer Science (STACS), Lyon, France, March 5-8, 2014, pages 226-238 2014[BibTeX][Link][Abstract]@inproceedings { DBLP:conf/stacs/ChenKLRW14, author = {Chen, Jian-Jia and Kao, Mong-Jen and Lee, D. T. and Rutter, Ignaz and Wagner, Dorothea}, title = {Online Dynamic Power Management with Hard Real-Time Guarantees}, booktitle = {31st International Symposium on Theoretical Aspects of Computer Science (STACS), Lyon, France, March 5-8, 2014}, year = {2014}, pages = {226-238}, url = {http://dx.doi.org/10.4230/LIPIcs.STACS.2014.226}, confidential = {n}, abstract = {We consider the problem of online dynamic power management that provides hard real-time guarantees for multi-processor systems. In this problem, a set of jobs, each associated with an arrival time, a deadline, and an execution time, arrives to the system in an online fashion. The objective is to compute a non-migrative preemptive schedule of the jobs and a sequence of power on/off operations of the processors so as to minimize the total energy consumption while ensuring that all the deadlines of the jobs are met. We assume that we can use as many processors as necessary. In this paper we examine the complexity of this problem and provide online strategies that lead to practical energy-efficient solutions for real-time multi-processor systems. First, we consider the case for which we know in advance that the set of jobs can be scheduled feasibly on a single processor. We show that, even in this case, the competitive factor of any online algorithm is at least 2.06. On the other hand, we give a 4-competitive online algorithm that uses at most two processors. For jobs with unit execution times, the competitive factor of this algorithm improves to 3.59. Second, we relax our assumption by considering as input multiple streams of jobs, each of which can be scheduled feasibly on a single processor. We present a trade-off between the energy-efficiency of the schedule and the number of processors to be used. More specifically, for k given job streams and h processors with h>k, we give a scheduling strategy such that the energy usage is at most 4k/(h-k) times that used by any schedule which schedules each of the k streams on a separate processor. Finally, we drop the assumptions on the input set of jobs. We show that the competitive factor of any online algorithm is at least 2.28, even for the case of unit job execution times for which we further derive an O(1)-competitive algorithm.}, }We consider the problem of online dynamic power management that provides hard real-time guarantees for multi-processor systems. In this problem, a set of jobs, each associated with an arrival time, a deadline, and an execution time, arrives to the system in an online fashion. The objective is to compute a non-migrative preemptive schedule of the jobs and a sequence of power on/off operations of the processors so as to minimize the total energy consumption while ensuring that all the deadlines of the jobs are met. We assume that we can use as many processors as necessary. In this paper we examine the complexity of this problem and provide online strategies that lead to practical energy-efficient solutions for real-time multi-processor systems. First, we consider the case for which we know in advance that the set of jobs can be scheduled feasibly on a single processor. We show that, even in this case, the competitive factor of any online algorithm is at least 2.06. On the other hand, we give a 4-competitive online algorithm that uses at most two processors. For jobs with unit execution times, the competitive factor of this algorithm improves to 3.59. Second, we relax our assumption by considering as input multiple streams of jobs, each of which can be scheduled feasibly on a single processor. We present a trade-off between the energy-efficiency of the schedule and the number of processors to be used. More specifically, for k given job streams and h processors with h>k, we give a scheduling strategy such that the energy usage is at most 4k/(h-k) times that used by any schedule which schedules each of the k streams on a separate processor. Finally, we drop the assumptions on the input set of jobs. We show that the competitive factor of any online algorithm is at least 2.28, even for the case of unit job execution times for which we further derive an O(1)-competitive algorithm. Helena Kotthaus, Michel Lang, Jörg Rahnenführer and Peter Marwedel.Runtime and Memory Consumption Analyses for Machine Learning R Programs. In Abstracts 45. Arbeitstagung, Ulmer Informatik-Berichte, pages 3-4 June 2013[BibTeX][PDF]@inproceedings { kotthaus/2013a, author = {Kotthaus, Helena and Lang, Michel and Rahnenf{\"u}hrer, J{\"o}rg and Marwedel, Peter}, title = {Runtime and Memory Consumption Analyses for Machine Learning R Programs}, booktitle = {Abstracts 45. Arbeitstagung, Ulmer Informatik-Berichte}, year = {2013}, pages = {3-4}, month = {jun}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/kotthaus_etal_2013a.pdf}, confidential = {n}, } Björn Döbel, Horst Schirmeier and Michael Engel.Investigating the Limitations of PVF for Realistic Program Vulnerability Assessment.
In Proceedings of the 5th Workshop on Design for Reliability (DFR) January 2013, - Best Poster Award -[BibTeX][PDF][Abstract]@inproceedings { doebel:2013:dfr, author = {D\"obel, Bj\"orn and Schirmeier, Horst and Engel, Michael}, title = {Investigating the Limitations of PVF for Realistic Program Vulnerability Assessment }, booktitle = {Proceedings of the 5th Workshop on Design for Reliability (DFR)}, year = {2013}, month = {January}, note = {- Best Poster Award -}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-dfr-doebel.pdf}, confidential = {n}, abstract = {From a software developer's perspective, fault injection (FI) is the most complete way of evaluating the sensitivity of a program against hardware errors. Unfortunately, FI campaigns require a substantial investment of both, time and computing resources, making their application infeasible in many cases. Program Vulnerability Factor (PVF) analysis has been proposed as an alternative for estimating software vulnerability. In this paper we present PVF/x86, a tool for computing the PVF for x86 programs. We validate the use of PVF analysis by running PVF/x86 on an image decoder application and compare the results to those obtained with a state-of-the-art FI framework. We identify weak spots of PVF analysis and outline ideas for addressing those points.}, }From a software developer's perspective, fault injection (FI) is the most complete way of evaluating the sensitivity of a program against hardware errors. Unfortunately, FI campaigns require a substantial investment of both, time and computing resources, making their application infeasible in many cases. Program Vulnerability Factor (PVF) analysis has been proposed as an alternative for estimating software vulnerability. In this paper we present PVF/x86, a tool for computing the PVF for x86 programs. We validate the use of PVF analysis by running PVF/x86 on an image decoder application and compare the results to those obtained with a state-of-the-art FI framework. We identify weak spots of PVF analysis and outline ideas for addressing those points. Daniel Cordes, Michael Engel, Olaf Neugebauer and Peter Marwedel.Automatic Extraction of Task-Level Parallelism for Heterogeneous MPSoCs. In Proceedings of the Fourth International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI 2013)Lyon, France, October 2013[BibTeX][PDF][Abstract]@inproceedings { Cordes:2013:PSTI, author = {Cordes, Daniel and Engel, Michael and Neugebauer, Olaf and Marwedel, Peter}, title = {Automatic Extraction of Task-Level Parallelism for Heterogeneous MPSoCs}, booktitle = {Proceedings of the Fourth International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI 2013)}, year = {2013}, series = {PSTI 2013}, address = {Lyon, France}, month = {oct}, keywords = {automatic parallelization; embedded software; heterogeneity; mpsoc; integer linear programming; task-level parallelism}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-psti-cordes.pdf}, confidential = {n}, abstract = {Heterogeneous multi-core platforms are increasingly attractive for embedded applications due to their adaptability and efficiency. This proliferation of heterogeneity demands new approaches for extracting thread level parallelism from sequential applications which have to be efficient at runtime. 
We present, to the best of our knowledge, the first Integer Linear Programming (ILP)-based parallelization approach for heterogeneous multi-core platforms. Using Hierarchical Task Graphs and high-level timing models, our approach manages to balance the extracted tasks while considering performance differences between cores. As a result, we obtain considerable speedups at runtime, significantly outperforming tools for homogeneous systems. We evaluate our approach by parallelizing standard benchmarks from various application domains.}, }Heterogeneous multi-core platforms are increasingly attractive for embedded applications due to their adaptability and efficiency. This proliferation of heterogeneity demands new approaches for extracting thread level parallelism from sequential applications which have to be efficient at runtime. We present, to the best of our knowledge, the first Integer Linear Programming (ILP)-based parallelization approach for heterogeneous multi-core platforms. Using Hierarchical Task Graphs and high-level timing models, our approach manages to balance the extracted tasks while considering performance differences between cores. As a result, we obtain considerable speedups at runtime, significantly outperforming tools for homogeneous systems. We evaluate our approach by parallelizing standard benchmarks from various application domains. Timon Kelter, Tim Harde, Peter Marwedel and Heiko Falk.Evaluation of resource arbitration methods for multi-core real-time systems. In Proceedings of the 13th International Workshop on Worst-Case Execution Time Analysis (WCET)Paris, France, July 2013[BibTeX][PDF][Link][Abstract]@inproceedings { kelter:2013:wcet, author = {Kelter, Timon and Harde, Tim and Marwedel, Peter and Falk, Heiko}, title = {Evaluation of resource arbitration methods for multi-core real-time systems}, booktitle = {Proceedings of the 13th International Workshop on Worst-Case Execution Time Analysis (WCET)}, year = {2013}, editor = {Claire Maiza}, address = {Paris, France}, month = {July}, url = {http://wcet2013.imag.fr/}, keywords = {wcet}, file = {http://drops.dagstuhl.de/opus/volltexte/2013/4117/pdf/2.pdf}, confidential = {n}, abstract = {Multi-core systems have become prevalent in the last years, because of their favorable properties in terms of energy consumption, computing power and design complexity. First attempts have been made to devise WCET analyses for multi-core processors, which have to deal with the problem that the cores may experience interferences during accesses to shared resources. To limit these interferences, the vast amount of previous work is proposing a strict TDMA (time division multiple access) schedule for arbitrating shared resources. Though this type of arbitration yields a high predictability, this advantage is paid for with a poor resource utilization. In this work, we compare different arbitration methods with respect to their predictability and average case performance. We show how known WCET analysis techniques can be extended to work with the presented arbitration strategies and perform an evaluation of the resulting ACETs and WCETs on an extensive set of realworld benchmarks. Results show that there are cases when TDMA is not the best strategy, especially when predictability and performance are equally important.}, }Multi-core systems have become prevalent in the last years, because of their favorable properties in terms of energy consumption, computing power and design complexity. 
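The strict TDMA arbitration that Kelter et al. compare against alternative strategies in the entry above trades utilization for predictability; a back-of-the-envelope sketch of the worst-case access delay under round-robin slots makes the trade-off concrete (an illustrative textbook-style bound, not the paper's analysis):

```python
def tdma_worst_case_wait(n_cores, slot_len, access_len):
    """Upper bound on the cycles a request may wait under strict TDMA:
    it can arrive just too late to finish in its own slot (access_len - 1
    cycles remaining) and must then sit out the slots of all other cores.
    The bound is independent of what the other cores do -- hence the high
    predictability -- but the reserved slots are wasted whenever those
    cores stay quiet, which is exactly the poor average-case utilization."""
    assert access_len <= slot_len
    return (access_len - 1) + (n_cores - 1) * slot_len
```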
First attempts have been made to devise WCET analyses for multi-core processors, which have to deal with the problem that the cores may experience interferences during accesses to shared resources. To limit these interferences, the vast amount of previous work is proposing a strict TDMA (time division multiple access) schedule for arbitrating shared resources. Though this type of arbitration yields a high predictability, this advantage is paid for with a poor resource utilization. In this work, we compare different arbitration methods with respect to their predictability and average case performance. We show how known WCET analysis techniques can be extended to work with the presented arbitration strategies and perform an evaluation of the resulting ACETs and WCETs on an extensive set of realworld benchmarks. Results show that there are cases when TDMA is not the best strategy, especially when predictability and performance are equally important. Daniel Cordes, Michael Engel, Olaf Neugebauer and Peter Marwedel.Automatic Extraction of Pipeline Parallelism for Embedded Heterogeneous Multi-Core Platforms. In Proceedings of the Sixteenth International Conference on Compilers, Architectures, and Synthesis for Embedded Systems (CASES 2013)Montreal, Canada, October 2013[BibTeX][PDF][Abstract]@inproceedings { Cordes:2013:CASES, author = {Cordes, Daniel and Engel, Michael and Neugebauer, Olaf and Marwedel, Peter}, title = {Automatic Extraction of Pipeline Parallelism for Embedded Heterogeneous Multi-Core Platforms}, booktitle = {Proceedings of the Sixteenth International Conference on Compilers, Architectures, and Synthesis for Embedded Systems (CASES 2013)}, year = {2013}, series = {CASES 2013}, address = {Montreal, Canada}, month = {oct}, keywords = {Automatic Parallelization; Heterogeneity; MPSoC; Embedded Software; Integer Linear Programming; Pipeline}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-cases-cordes.pdf}, confidential = {n}, abstract = {Automatic parallelization of sequential applications is the key for efficient use and optimization of current and future embedded multi-core systems. However, existing approaches often fail to achieve efficient balancing of tasks running on heterogeneous cores of an MPSoC. A reason for this is often insufficient knowledge of the underlying architecture's performance. In this paper, we present a novel parallelization approach for embedded MPSoCs that combines pipeline parallelization for loops with knowledge about different execution times for tasks on cores with different performance properties. Using Integer Linear Programming, an optimal solution with respect to the model used is derived implementing tasks with a well-balanced execution behavior. We evaluate our pipeline parallelization approach for heterogeneous MPSoCs using a set of standard embedded benchmarks and compare it with two existing state-of-the-art approaches. For all benchmarks, our parallelization approach obtains significantly higher speedups than either approach on heterogeneous MPSoCs. }, }Automatic parallelization of sequential applications is the key for efficient use and optimization of current and future embedded multi-core systems. However, existing approaches often fail to achieve efficient balancing of tasks running on heterogeneous cores of an MPSoC. A reason for this is often insufficient knowledge of the underlying architecture's performance. 
In this paper, we present a novel parallelization approach for embedded MPSoCs that combines pipeline parallelization for loops with knowledge about different execution times for tasks on cores with different performance properties. Using Integer Linear Programming, an optimal solution with respect to the model used is derived implementing tasks with a well-balanced execution behavior. We evaluate our pipeline parallelization approach for heterogeneous MPSoCs using a set of standard embedded benchmarks and compare it with two existing state-of-the-art approaches. For all benchmarks, our parallelization approach obtains significantly higher speedups than either approach on heterogeneous MPSoCs. Andreas Heinig, Ingo Korb, Florian Schmoll, Peter Marwedel and Michael Engel.Fast and Low-Cost Instruction-Aware Fault Injection. In Proc. of SOBRES 2013 2013[BibTeX][Link][Abstract]@inproceedings { heinig:2013:sobres, author = {Heinig, Andreas and Korb, Ingo and Schmoll, Florian and Marwedel, Peter and Engel, Michael}, title = {Fast and Low-Cost Instruction-Aware Fault Injection}, booktitle = {Proc. of SOBRES 2013}, year = {2013}, url = {http://danceos.org/sobres/2013/papers/SOBRES-640-Heinig.pdf}, keywords = {ders}, confidential = {n}, abstract = {In order to assess the robustness of software-based fault-tolerance methods, extensive tests have to be performed that inject faults, such as bit flips, into hardware components of a running system. Fault injection commonly uses either system simulations, resulting in execution times orders of magnitude longer than on real systems, or exposes a real system to error sources like radiation. This can take place in real time, but it enables only a very coarse-grained control over the affected system component. A solution combining the best characteristics from both approaches should achieve precise fault injection in real hardware systems. The approach presented in this paper uses the JTAG background debug facility of a CPU to inject faults into main memory and registers of a running system. Compared to similar earlier approaches, our solution is able to achieve rapid fault injection using a low-cost microcontroller instead of a complex FPGA. Consequently, our injection software is much more flexible. It allows to restrict error injection to the execution of a set of predefined components, resulting in a more precise control of the injection, and also emulates error reporting, which enables the evaluation of different error detection approaches in addition to robustness evaluation.}, }In order to assess the robustness of software-based fault-tolerance methods, extensive tests have to be performed that inject faults, such as bit flips, into hardware components of a running system. Fault injection commonly uses either system simulations, resulting in execution times orders of magnitude longer than on real systems, or exposes a real system to error sources like radiation. This can take place in real time, but it enables only a very coarse-grained control over the affected system component. A solution combining the best characteristics from both approaches should achieve precise fault injection in real hardware systems. The approach presented in this paper uses the JTAG background debug facility of a CPU to inject faults into main memory and registers of a running system. Compared to similar earlier approaches, our solution is able to achieve rapid fault injection using a low-cost microcontroller instead of a complex FPGA. 
Consequently, our injection software is much more flexible. It allows to restrict error injection to the execution of a set of predefined components, resulting in a more precise control of the injection, and also emulates error reporting, which enables the evaluation of different error detection approaches in addition to robustness evaluation. Daniel Cordes, Michael Engel, Olaf Neugebauer and Peter Marwedel.Automatic Extraction of Multi-Objective Aware Parallelism for Heterogeneous MPSoCs. In Proceedings of the Sixth International Workshop on Multi-/Many-core Computing Systems (MuCoCoS 2013)Edinburgh, Scotland, UK, September 2013[BibTeX][PDF][Abstract]@inproceedings { Cordes:2013:MUCOCOS, author = {Cordes, Daniel and Engel, Michael and Neugebauer, Olaf and Marwedel, Peter}, title = {Automatic Extraction of Multi-Objective Aware Parallelism for Heterogeneous MPSoCs}, booktitle = {Proceedings of the Sixth International Workshop on Multi-/Many-core Computing Systems (MuCoCoS 2013)}, year = {2013}, series = {MuCoCoS 2013}, address = {Edinburgh, Scotland, UK}, month = {sep}, keywords = {automatic parallelization; embedded software; heterogeneity; mpsoc; genetic algorithms; task-level parallelism; pipeline parallelism; multi-objective}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-mucocos-cordes.pdf}, confidential = {n}, abstract = {Heterogeneous MPSoCs are used in a large fraction of current embedded systems. In order to efficiently exploit the available processing power, advanced parallelization techniques are required. In addition to consider performance variances between heterogeneous cores, these methods have to be multi-objective aware to be useful for resource restricted embedded systems. This multi-objective optimization requirement results in an explosion of the design space size. As a consequence, efficient approaches are required to find promising solution candidates. In this paper, we present the first portable genetic algorithm-based approach to speed up ANSI-C applications by combining extraction techniques for task-level and pipeline parallelism for heterogeneous multicores while considering additional objectives. Using our approach enables embedded system designers to select a parallelization of an application from a set of Pareto-optimal solutions according to the performance and energy consumption requirements of a given system. The evaluation of a large set of typical embedded benchmarks shows that our approach is able to generate solutions with low energy consumption, high speedup, low communication overhead or useful trade-offs between these three objectives.}, }Heterogeneous MPSoCs are used in a large fraction of current embedded systems. In order to efficiently exploit the available processing power, advanced parallelization techniques are required. In addition to consider performance variances between heterogeneous cores, these methods have to be multi-objective aware to be useful for resource restricted embedded systems. This multi-objective optimization requirement results in an explosion of the design space size. As a consequence, efficient approaches are required to find promising solution candidates. In this paper, we present the first portable genetic algorithm-based approach to speed up ANSI-C applications by combining extraction techniques for task-level and pipeline parallelism for heterogeneous multicores while considering additional objectives. 
Using our approach enables embedded system designers to select a parallelization of an application from a set of Pareto-optimal solutions according to the performance and energy consumption requirements of a given system. The evaluation of a large set of typical embedded benchmarks shows that our approach is able to generate solutions with low energy consumption, high speedup, low communication overhead or useful trade-offs between these three objectives. Jan Kleinsorge, Heiko Falk and Peter Marwedel.Simple Analysis of Partial Worst-case Execution Paths on General Control Flow Graphs. In Proceedings of the International Conference on Embedded Software (EMSOFT 2013), Montreal, Canada, October 2013[BibTeX][Link]@inproceedings { Kleinsorge:2013:EMSOFT, author = {Kleinsorge, Jan and Falk, Heiko and Marwedel, Peter}, title = {Simple Analysis of Partial Worst-case Execution Paths on General Control Flow Graphs}, booktitle = {Proceedings of the International Conference on Embedded Software (EMSOFT 2013)}, year = {2013}, series = {EMSOFT 2013}, address = {Montreal, Canada}, month = {oct}, url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013_emsoft.pdf}, keywords = {wcet; Worst-case Execution Time; Path Analysis; Static Analysis}, confidential = {n}, } A. Herkersdorf, M. Engel, M. Glaß, J. Henkel, V.B. Kleeberger, M.A. Kochte, J.M. Kühn, S.R. Nassif, H. Rauchfuss, W. Rosenstiel, U. Schlichtmann, M. Shafique, M.B. Tahoori, J. Teich, N. Wehn, C. Weis and H.-J. Wunderlich.Cross-Layer Dependability Modeling and Abstraction in Systems on Chip. In Proceedings of the Workshop on Silicon Errors in Logic System Effects (SELSE) March 2013[BibTeX][PDF][Abstract]@inproceedings { herkersdorf:2013:selse, author = {Herkersdorf, A. and Engel, M. and Gla{\ss}, M. and Henkel, J. and Kleeberger, V.B. and Kochte, M.A. and K\"uhn, J.M. and Nassif, S.R. and Rauchfuss, H. and Rosenstiel, W. and Schlichtmann, U. and Shafique, M. and Tahoori, M.B. and Teich, J. and Wehn, N. and Weis, C. and Wunderlich, H.-J.}, title = {Cross-Layer Dependability Modeling and Abstraction in Systems on Chip}, booktitle = {Proceedings of the Workshop on Silicon Errors in Logic System Effects (SELSE)}, year = {2013}, month = {March}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-selse-herkersdorf.pdf}, confidential = {n}, abstract = {The Resilience Articulation Point (RAP) model aims at provisioning researchers and developers with a probabilistic fault abstraction and error propagation framework covering all hardware/software layers of a System on Chip. RAP assumes that physically induced faults at the technology or CMOS device layer will eventually manifest themselves as a single or multiple bit flip(s). When probabilistic error functions for specific fault origins are known at the bit or signal level, knowledge about the unit of design and its environment allow the transformation of the bit-related error functions into characteristic higher layer representations, such as error functions for data words, Finite State Machine (FSM) state, macro interfaces or software variables. Thus, design concerns at higher abstraction layers can be investigated without the necessity to further consider the full details of lower levels of design. This paper introduces the ideas of RAP based on examples of radiation induced soft errors in SRAM cells and sequential CMOS logic.
It shows by example how probabilistic bit flips are systematically abstracted and propagated towards higher abstraction levels up to the application software layer, and how RAP can be used to parameterize architecture level resilience methods.}, }The Resilience Articulation Point (RAP) model aims at provisioning researchers and developers with a probabilistic fault abstraction and error propagation framework covering all hardware/software layers of a System on Chip. RAP assumes that physically induced faults at the technology or CMOS device layer will eventually manifest themselves as a single or multiple bit flip(s). When probabilistic error functions for specific fault origins are known at the bit or signal level, knowledge about the unit of design and its environment allow the transformation of the bit-related error functions into characteristic higher layer representations, such as error functions for data words, Finite State Machine (FSM) state, macro interfaces or software variables. Thus, design concerns at higher abstraction layers can be investigated without the necessity to further consider the full details of lower levels of design. This paper introduces the ideas of RAP based on examples of radiation induced soft errors in SRAM cells and sequential CMOS logic. It shows by example how probabilistic bit flips are systematically abstracted and propagated towards higher abstraction levels up to the application software layer, and how RAP can be used to parameterize architecture level resilience methods. Pascal Libuschewski, Dominic Siedhoff, Constantin Timm, Andrej Gelenberg and Frank Weichert.Fuzzy-enhanced, Real-time capable Detection of Biological Viruses Using a Portable Biosensor. In Proceedings of the International Joint Conference on Biomedical Engineering Systems and Technologies (BIOSIGNALS), pages 169-174 February 2013[BibTeX][Abstract]@inproceedings { Libuschewski/etal/2013b, author = {Libuschewski, Pascal and Siedhoff, Dominic and Timm, Constantin and Gelenberg, Andrej and Weichert, Frank}, title = {Fuzzy-enhanced, Real-time capable Detection of Biological Viruses Using a Portable Biosensor}, booktitle = {Proceedings of the International Joint Conference on Biomedical Engineering Systems and Technologies (BIOSIGNALS)}, year = {2013}, pages = {169-174}, month = {February}, confidential = {n}, abstract = {This work presents a novel portable biosensor for indirect detection of viruses by optical microscopy. The focus lies on energy-efficient real-time data analysis for automated virus detection. The superiority of our fuzzy-enhanced time-series analysis over hard thresholding is demonstrated. Real-time capability is achieved through general-purpose computing on graphics processing units (GPGPU). It is shown that this virus detection is real-time capable on an off-the-shelf laptop computer, allowing for a wide range of in-field use-cases.}, }This work presents a novel portable biosensor for indirect detection of viruses by optical microscopy. The focus lies on energy-efficient real-time data analysis for automated virus detection. The superiority of our fuzzy-enhanced time-series analysis over hard thresholding is demonstrated. Real-time capability is achieved through general-purpose computing on graphics processing units (GPGPU). It is shown that this virus detection is real-time capable on an off-the-shelf laptop computer, allowing for a wide range of in-field use-cases.
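A toy contrast between hard thresholding and the fuzzy-enhanced classification this entry compares; the piecewise-linear membership shape and the parameter names are assumptions for illustration, not the paper's exact formulation:

```python
def hard_signal(sample, threshold):
    # Hard thresholding: a time-series sample is either fully "signal"
    # or fully "background" -- borderline samples flip abruptly.
    return 1.0 if sample >= threshold else 0.0

def fuzzy_signal(sample, low, high):
    # Fuzzy membership: samples between the two bounds receive a graded
    # degree of confidence, so noisy borderline points contribute
    # proportionally instead of being decided all-or-nothing.
    if sample <= low:
        return 0.0
    if sample >= high:
        return 1.0
    return (sample - low) / (high - low)
```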
Janmartin Jahn, Sebastian Kobbe, Santiago Pagani, Jian-Jia Chen and Jörg Henkel.Runtime resource allocation for software pipelines. In International Workshop on Software and Compilers for Embedded Systems, M-SCOPES '13, Sankt Goar, Germany, June 19-21, 2013, pages 96--99 2013[BibTeX][Link]@inproceedings { DBLP:conf/scopes/JahnKPCH13, author = {Jahn, Janmartin and Kobbe, Sebastian and Pagani, Santiago and Chen, Jian{-}Jia and Henkel, J{\"{o}}rg}, title = {Runtime resource allocation for software pipelines}, booktitle = {International Workshop on Software and Compilers for Embedded Systems, {M-SCOPES} '13, Sankt Goar, Germany, June 19-21, 2013}, year = {2013}, bdsk-url-1 = {http://doi.acm.org/10.1145/2463596.2486156}, bdsk-url-2 = {http://dx.doi.org/10.1145/2463596.2486156}, pages = {96--99}, url = {http://doi.acm.org/10.1145/2463596.2486156}, confidential = {n}, } Michael Engel.Adding Flexibility to Fault-Tolerance by Analyzing Hardware-Software Interactions. In Invited Talk at HiPEAC Computing Systems Week, Ghent, Belgium, October 2012, Thematic Session "The intertwining challenges of reliability, testing and verification"[BibTeX][Abstract]@inproceedings { engel:csw2012, author = {Engel, Michael}, title = {Adding Flexibility to Fault-Tolerance by Analyzing Hardware-Software Interactions}, booktitle = {Invited Talk at HiPEAC Computing Systems Week}, year = {2012}, address = {Ghent, Belgium}, month = {oct}, organization = {HiPEAC}, note = {Thematic Session "The intertwining challenges of reliability, testing and verification"}, confidential = {n}, abstract = {With an expected increasing number of permanent and transient errors and a growing influence of variability on semiconductor operation, correcting all possible errors in hardware will become more and more infeasible. Future fault-tolerant systems will have to incorporate information about possible effects of errors on the application level in order to reduce the hardware and software overhead for fault-tolerance. This requires a reconsideration of the interaction of hardware and software. By extending programming language semantics and performing compiler-based static analyses on error effects and propagation, it becomes possible to introduce reliability requirements into software development as a first-class member, allowing system designers to tailor the fault tolerance behavior of a system to given requirements, like expected uptime or quality-of-service bounds. This talk will give an overview of current research approaches to build these more flexible fault-tolerant systems with a special focus on the projects in Germany's research program SPP1500 "Dependable Embedded Systems".}, }With an expected increasing number of permanent and transient errors and a growing influence of variability on semiconductor operation, correcting all possible errors in hardware will become more and more infeasible. Future fault-tolerant systems will have to incorporate information about possible effects of errors on the application level in order to reduce the hardware and software overhead for fault-tolerance. This requires a reconsideration of the interaction of hardware and software.
By extending programming language semantics and performing compiler-based static analyses on error effects and propagation, it becomes possible to introduce reliability requirements into software development as a first-class member, allowing system designers to tailor the fault tolerance behavior of a system to given requirements, like expected uptime or quality-of-service bounds. This talk will give an overview of current research approaches to build these more flexible fault-tolerant systems with a special focus on the projects in Germany's research program SPP1500 "Dependable Embedded Systems". Daniel Cordes and Peter Marwedel.Multi-Objective Aware Extraction of Task-Level Parallelism Using Genetic Algorithms. In Proceedings of Design, Automation and Test in Europe (DATE 2012)Dresden, Germany, March 2012[BibTeX][PDF][Abstract]@inproceedings { cordes:12:date, author = {Cordes, Daniel and Marwedel, Peter}, title = {Multi-Objective Aware Extraction of Task-Level Parallelism Using Genetic Algorithms}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2012)}, year = {2012}, address = {Dresden, Germany}, month = {mar}, keywords = {Automatic Parallelization, Embedded Software, Multi-Objective, Genetic Algorithms, Task-Level Parallelism, Energy awareness}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-date-cordes.pdf}, confidential = {n}, abstract = {A large amount of research work has been done in the area of automatic parallelization for decades, resulting in a huge amount of tools, which should relieve the designer from the burden of manually parallelizing an application. Unfortunately, most of these tools are only optimizing the execution time by splitting up applications into concurrently executed tasks. In the domain of embedded devices, however, it is not sufficient to look only at this criterion. Since most of these devices are constraint-driven regarding execution time, energy consumption, heat dissipation and other objectives, a good trade-off has to be found to efficiently map applications to multiprocessor system on chip (MPSoC) devices. Therefore, we developed a fully automated multi-objective aware parallelization framework, which optimizes different objectives at the same time. The tool returns a Pareto-optimal front of solutions of the parallelized application to the designer, so that the solution with the best trade-off can be chosen.}, }A large amount of research work has been done in the area of automatic parallelization for decades, resulting in a huge amount of tools, which should relieve the designer from the burden of manually parallelizing an application. Unfortunately, most of these tools are only optimizing the execution time by splitting up applications into concurrently executed tasks. In the domain of embedded devices, however, it is not sufficient to look only at this criterion. Since most of these devices are constraint-driven regarding execution time, energy consumption, heat dissipation and other objectives, a good trade-off has to be found to efficiently map applications to multiprocessor system on chip (MPSoC) devices. Therefore, we developed a fully automated multi-objective aware parallelization framework, which optimizes different objectives at the same time. The tool returns a Pareto-optimal front of solutions of the parallelized application to the designer, so that the solution with the best trade-off can be chosen. 
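Several of the parallelization papers above return a Pareto-optimal front of trade-offs instead of a single solution. As a hedged illustration of that selection step, the following self-contained Python sketch filters non-dominated (execution time, energy) points; the numeric values are invented, and the actual framework evaluates real parallelized program variants rather than fixed tuples.

```python
# Minimal sketch of Pareto-front selection over two minimized objectives.
# The candidate values are made up for illustration.

def dominates(a, b):
    # a dominates b if a is no worse in every objective and strictly
    # better in at least one (both objectives are minimized).
    return all(x <= y for x, y in zip(a, b)) and any(x < y for x, y in zip(a, b))

def pareto_front(candidates):
    # Keep only candidates that no other candidate dominates.
    return [c for c in candidates
            if not any(dominates(o, c) for o in candidates if o != c)]

if __name__ == "__main__":
    # (execution time, energy) of hypothetical parallelization variants
    variants = [(10.0, 5.0), (8.0, 7.0), (9.0, 6.0), (12.0, 4.0), (11.0, 6.5)]
    print(pareto_front(variants))
    # -> [(10.0, 5.0), (8.0, 7.0), (9.0, 6.0), (12.0, 4.0)]
```

A designer then picks one point from this front, for example the variant with the lowest energy that still meets the application's deadline.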
Michael Engel and Peter Marwedel.Semantic Gaps in Software-Based Reliability. In Proceedings of the 4th Workshop on Design for Reliability (DFR'12)Paris, France, January 2012[BibTeX][Abstract]@inproceedings { engel:dfr:2012, author = {Engel, Michael and Marwedel, Peter}, title = {Semantic Gaps in Software-Based Reliability}, booktitle = {Proceedings of the 4th Workshop on Design for Reliability (DFR'12)}, year = {2012}, address = {Paris, France}, month = {jan}, organization = {HiPEAC}, keywords = {ders}, confidential = {n}, abstract = {Future semiconductors will show a heterogeneous distribution of permanent faults as a result of fabrication variations and aging. To increase yields and lifetimes of these chips, a fault tolerance approach is required that handles resources on a small-scale basis with low overhead. In embedded systems, this overhead can be reduced by classifying data and instructions to determine the varying impact of errors on different instructions and data. Using this classification, only errors with significant impact on system behavior have to be corrected. In this position paper, we describe one problem with this analysis, the semantic gap between high-level language source code and the low-level data flow through architecture components. In addition, we discuss possible approaches to handle this gap. Of special interest are the implications on achieving reliable execution of dependability-critical code. }, }Future semiconductors will show a heterogeneous distribution of permanent faults as a result of fabrication variations and aging. To increase yields and lifetimes of these chips, a fault tolerance approach is required that handles resources on a small-scale basis with low overhead. In embedded systems, this overhead can be reduced by classifying data and instructions to determine the varying impact of errors on different instructions and data. Using this classification, only errors with significant impact on system behavior have to be corrected. In this position paper, we describe one problem with this analysis, the semantic gap between high-level language source code and the low-level data flow through architecture components. In addition, we discuss possible approaches to handle this gap. Of special interest are the implications on achieving reliable execution of dependability-critical code. Olivera Jovanovic, Nils Kneuper, Peter Marwedel and Michael Engel.ILP-based Memory-Aware Mapping Optimization for MPSoCs. In The 10th IEEE/IFIP International Conference on Embedded and Ubiquitous ComputingPaphos, Cyprus, December 2012[BibTeX][PDF][Abstract]@inproceedings { jovanovic:2012b, author = {Jovanovic, Olivera and Kneuper, Nils and Marwedel, Peter and Engel, Michael}, title = {ILP-based Memory-Aware Mapping Optimization for MPSoCs}, booktitle = {The 10th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing}, year = {2012}, address = {Paphos, Cyprus}, month = {December}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-cse-jovanovic.pdf}, confidential = {n}, abstract = {The mapping of applications onto multiprocessor system-on-chip (MPSoC) devices is an important and complex optimization task. The goal is to efficiently distribute application tasks to available processors while optimizing for energy or runtime. 
Unfortunately, the influence of memories or memory hierarchies is not considered in existing mapping optimizations so far, even though it is a well-known fact that memories have a drastic impact on runtime and energy consumption of the system. In this paper, we address the challenge of finding an efficient application to MPSoC mapping while explicitly considering the underlying memory subsystem and an efficient mapping of tasks’ memory objects to memories. For this purpose, we developed a memory-aware mapping tool based on ILP optimization. Evaluations on various benchmarks show that our memory-aware mapping tool outperforms state-of-the-art mapping optimizations by reducing the runtime up to 18%, and energy consumption up to 21%.}, }The mapping of applications onto multiprocessor system-on-chip (MPSoC) devices is an important and complex optimization task. The goal is to efficiently distribute application tasks to available processors while optimizing for energy or runtime. Unfortunately, the influence of memories or memory hierarchies is not considered in existing mapping optimizations so far, even though it is a well-known fact that memories have a drastic impact on runtime and energy consumption of the system. In this paper, we address the challenge of finding an efficient application to MPSoC mapping while explicitly considering the underlying memory subsystem and an efficient mapping of tasks’ memory objects to memories. For this purpose, we developed a memory-aware mapping tool based on ILP optimization. Evaluations on various benchmarks show that our memory-aware mapping tool outperforms state-of-the-art mapping optimizations by reducing the runtime up to 18%, and energy consumption up to 21%. Sascha Plazar, Jan Kleinsorge, Heiko Falk and Peter Marwedel.WCET-aware Static Locking of Instruction Caches. In Proceedings of the International Symposium on Code Generation and Optimization (CGO), pages 44-52San Jose, CA, USA, April 2012[BibTeX][Link][Abstract]@inproceedings { plazar:2012:cgo, author = {Plazar, Sascha and Kleinsorge, Jan and Falk, Heiko and Marwedel, Peter}, title = {WCET-aware Static Locking of Instruction Caches}, booktitle = {Proceedings of the International Symposium on Code Generation and Optimization (CGO)}, year = {2012}, pages = {44-52}, address = {San Jose, CA, USA}, month = {apr}, url = {http://www.uni-ulm.de/fileadmin/website_uni_ulm/iui.inst.050/profile/profil_hfalk/publications/20120402-cgo-plazar.pdf}, keywords = {wcet}, confidential = {n}, abstract = {In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches. This step was necessary in order to bridge the increasingly growing gap between processor and memory system performance. Static analysis techniques had to be developed to allow the estimation of the cache behavior and an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a WCET-aware optimization technique for static I-cache locking which improves a program’s performance and predictability. To select the memory blocks to lock into the cache and avoid time-consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP).
The ILP models the worst-case execution path (WCEP) of a program and takes the influence of locked cache contents into account. By modeling the effect of locked memory blocks on the runtime of basic blocks, the overall WCET of a program can be minimized. We show that our optimization is able to reduce the WCET of real-life benchmarks by up to 40.8%. At the same time, our proposed approach is able to outperform a regular cache by up to 23.8% in terms of WCET.}, }In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches. This step was necessary in order to bridge the increasingly growing gap between processor and memory system performance. Static analysis techniques had to be developed to allow the estimation of the cache behavior and an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a WCET-aware optimization technique for static I-cache locking which improves a program’s performance and predictability. To select the memory blocks to lock into the cache and avoid time-consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP). The ILP models the worst-case execution path (WCEP) of a program and takes the influence of locked cache contents into account. By modeling the effect of locked memory blocks on the runtime of basic blocks, the overall WCET of a program can be minimized. We show that our optimization is able to reduce the WCET of real-life benchmarks by up to 40.8%. At the same time, our proposed approach is able to outperform a regular cache by up to 23.8% in terms of WCET. Andreas Heinig, Vincent J. Mooney, Florian Schmoll, Peter Marwedel, Krishna Palem and Michael Engel.Classification-based Improvement of Application Robustness and Quality of Service in Probabilistic Computer Systems. In Proceedings of ARCS 2012 - International Conference on Architecture of Computing SystemsMunich, Germany, March 2012, -- ARCS 2012 Best Paper Award Winner --[BibTeX][PDF][Abstract]@inproceedings { heinig:2012:arcs, author = {Heinig, Andreas and Mooney, Vincent J. and Schmoll, Florian and Marwedel, Peter and Palem, Krishna and Engel, Michael}, title = {Classification-based Improvement of Application Robustness and Quality of Service in Probabilistic Computer Systems}, booktitle = {Proceedings of ARCS 2012 - International Conference on Architecture of Computing Systems}, year = {2012}, address = {Munich, Germany}, month = {mar}, note = {-- ARCS 2012 Best Paper Award Winner --}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-arcs-heinig.pdf}, confidential = {n}, abstract = {Future semiconductors no longer guarantee permanent deterministic operation. They are expected to show probabilistic behavior due to lowered voltages and shrinking structures. Compared to radiation-induced errors, probabilistic systems face increased error frequencies leading to unexpected bit-flips. Approaches like probabilistic CMOS provide methods to control error distributions which reduce the error probability in more significant bits. However, instructions handling control flow or pointers still require determinism, requiring a classification to identify these instructions.
We apply our transient error classification to probabilistic circuits using differing voltage distributions. Static analysis ensures that probabilistic effects only affect unreliable operations which accept a certain level of impreciseness, and that errors in probabilistic components will never propagate to critical operations. To evaluate, we analyze robustness and quality-of-service of an H.264 video decoder. Using classification results, we map unreliable arithmetic operations onto probabilistic components of an MPARM model, while remaining operations use deterministic components.}, }Future semiconductors no longer guarantee permanent deterministic operation. They are expected to show probabilistic behavior due to lowered voltages and shrinking structures. Compared to radiation-induced errors, probabilistic systems face increased error frequencies leading to unexpected bit-flips. Approaches like probabilistic CMOS provide methods to control error distributions which reduce the error probability in more significant bits. However, instructions handling control flow or pointers still require determinism, requiring a classification to identify these instructions. We apply our transient error classification to probabilistic circuits using differing voltage distributions. Static analysis ensures that probabilistic effects only affect unreliable operations which accept a certain level of impreciseness, and that errors in probabilistic components will never propagate to critical operations. To evaluate, we analyze robustness and quality-of-service of an H.264 video decoder. Using classification results, we map unreliable arithmetic operations onto probabilistic components of an MPARM model, while remaining operations use deterministic components. Sudipta Chattopadhyay, Chong Lee Kee, Abhik Roychoudhury, Timon Kelter, Heiko Falk and Peter Marwedel.A Unified WCET Analysis Framework for Multi-core Platforms. In IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS), pages 99-108Beijing, China, April 2012[BibTeX][PDF][Link][Abstract]@inproceedings { kelter:2012:rtas, author = {Chattopadhyay, Sudipta and Kee, Chong Lee and Roychoudhury, Abhik and Kelter, Timon and Falk, Heiko and Marwedel, Peter}, title = {A Unified WCET Analysis Framework for Multi-core Platforms}, booktitle = {IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS)}, year = {2012}, pages = {99-108}, address = {Beijing, China}, month = {April}, url = {http://www.rtas.org/12-home.htm}, keywords = {wcet}, file = {http://www.comp.nus.edu.sg/~sudiptac/papers/mxtiming.pdf}, confidential = {n}, abstract = {With the advent of multi-core architectures, worst case execution time (WCET) analysis has become an increasingly difficult problem. In this paper, we propose a unified WCET analysis framework for multi-core processors featuring both shared cache and shared bus. Compared to other previous works, our work differs by modeling the interaction of shared cache and shared bus with other basic micro-architectural components (e.g. pipeline and branch predictor). In addition, our framework does not assume a timing anomaly free multicore architecture for computing the WCET. A detailed experiment methodology suggests that we can obtain reasonably tight WCET estimates in a wide range of benchmark programs.}, }With the advent of multi-core architectures, worst case execution time (WCET) analysis has become an increasingly difficult problem.
In this paper, we propose a unified WCET analysis framework for multi-core processors featuring both shared cache and shared bus. Compared to other previous works, our work differs by modeling the interaction of shared cache and shared bus with other basic micro-architectural components (e.g. pipeline and branch predictor). In addition, our framework does not assume a timing anomaly free multicore architecture for computing the WCET. A detailed experiment methodology suggests that we can obtain reasonably tight WCET estimates in a wide range of benchmark programs. Helena Kotthaus, Sascha Plazar and Peter Marwedel.A JVM-based Compiler Strategy for the R Language. In Abstract Booklet at The 8th International R User Conference (UseR!) WiP, pages 68Nashville, Tennessee, USA, June 2012[BibTeX][PDF]@inproceedings { kotthaus:12:user, author = {Kotthaus, Helena and Plazar, Sascha and Marwedel, Peter}, title = {A JVM-based Compiler Strategy for the R Language}, booktitle = {Abstract Booklet at The 8th International R User Conference (UseR!) WiP}, year = {2012}, pages = {68}, address = {Nashville, Tennessee, USA}, month = {jun}, keywords = {R language, Java, dynamic compiler optimization}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-user-kotthaus.pdf}, confidential = {n}, } Daniel Cordes, Michael Engel, Peter Marwedel and Olaf Neugebauer.Automatic extraction of multi-objective aware pipeline parallelism using genetic algorithms. In Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesisTampere, Finland, October 2012[BibTeX][PDF][Abstract]@inproceedings { Cordes:2012:CODES, author = {Cordes, Daniel and Engel, Michael and Marwedel, Peter and Neugebauer, Olaf}, title = {Automatic extraction of multi-objective aware pipeline parallelism using genetic algorithms}, booktitle = {Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis}, year = {2012}, series = {CODES+ISSS '12}, address = {Tampere, Finland}, month = {oct}, publisher = {ACM}, keywords = {automatic parallelization, embedded software, energy, genetic algorithms, multi-objective, pipeline parallelism}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-codes-cordes.pdf}, confidential = {n}, abstract = {The development of automatic parallelization techniques has been fascinating researchers for decades. This has resulted in a significant amount of tools, which should relieve the designer from the burden of manually parallelizing an application. However, most of these tools only focus on minimizing execution time which drastically reduces their applicability to embedded devices. It is essential to find good trade-offs between different objectives like, e.g., execution time, energy consumption, or communication overhead, if applications should be parallelized for embedded multiprocessor system-on-chip (MPSoC) devices. Another important aspect which has to be taken into account is the streaming-based structure found in many embedded applications such as multimedia and network services. The best way to parallelize these applications is to extract pipeline parallelism. Therefore, this paper presents the first multi-objective aware approach exploiting pipeline parallelism automatically to make it most suitable for resource-restricted embedded devices. 
We have compared the new pipeline parallelization approach to an existing task-level extraction technique. The evaluation has shown that the new approach extracts very efficient multi-objective aware parallelism. In addition, the two approaches have been combined and it could be shown that both approaches perfectly complement each other.}, }The development of automatic parallelization techniques has been fascinating researchers for decades. This has resulted in a significant amount of tools, which should relieve the designer from the burden of manually parallelizing an application. However, most of these tools only focus on minimizing execution time which drastically reduces their applicability to embedded devices. It is essential to find good trade-offs between different objectives like, e.g., execution time, energy consumption, or communication overhead, if applications should be parallelized for embedded multiprocessor system-on-chip (MPSoC) devices. Another important aspect which has to be taken into account is the streaming-based structure found in many embedded applications such as multimedia and network services. The best way to parallelize these applications is to extract pipeline parallelism. Therefore, this paper presents the first multi-objective aware approach exploiting pipeline parallelism automatically to make it most suitable for resource-restricted embedded devices. We have compared the new pipeline parallelization approach to an existing task-level extraction technique. The evaluation has shown that the new approach extracts very efficient multi-objective aware parallelism. In addition, the two approaches have been combined and it could be shown that both approaches perfectly complement each other. Michael Engel and Björn Döbel.The Reliable Computing Base – A Paradigm for Software-based Reliability. In Proceedings of SOBRES September 2012[BibTeX][PDF][Abstract]@inproceedings { engel:2012:sobres, author = {Engel, Michael and D\"obel, Bj\"orn}, title = {The Reliable Computing Base – A Paradigm for Software-based Reliability}, booktitle = {Proceedings of SOBRES}, year = {2012}, month = {sep}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-sobres-engel.pdf}, confidential = {n}, abstract = {For embedded systems, the use of software-based error detection and correction approaches is an attractive means in order to reduce often inconvenient overheads in hardware. To ensure that such a software-based fault-tolerance approach is effective, it must be guaranteed that a certain amount of hardware and software components in a system can be trusted to provide correct service in the presence of errors. In analogy with the Trusted Computing Base (TCB) in security research, we call these components the Reliable Computing Base (RCB). Similar to the TCB, it is also desirable to reduce the size of the RCB, so the overhead in redundant hardware resources can be reduced. In this position paper, we describe approaches for informal as well as formal definitions of the RCB, the related metrics and approaches for RCB minimization. }, }For embedded systems, the use of software-based error detection and correction approaches is an attractive means in order to reduce often inconvenient overheads in hardware. 
To ensure that such a software-based fault-tolerance approach is effective, it must be guaranteed that a certain amount of hardware and software components in a system can be trusted to provide correct service in the presence of errors. In analogy with the Trusted Computing Base (TCB) in security research, we call these components the Reliable Computing Base (RCB). Similar to the TCB, it is also desirable to reduce the size of the RCB, so the overhead in redundant hardware resources can be reduced. In this position paper, we describe approaches for informal as well as formal definitions of the RCB, the related metrics and approaches for RCB minimization. Björn Döbel, Hermann Härtig and Michael Engel.Operating System Support for Redundant Multithreading. In Proceedings of EMSOFT October 2012[BibTeX][PDF][Abstract]@inproceedings { doebel:2012:EMSOFT, author = {D\"obel, Bj\"orn and H\"artig, Hermann and Engel, Michael}, title = {Operating System Support for Redundant Multithreading}, booktitle = {Proceedings of EMSOFT}, year = {2012}, month = {oct}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-emsoft-doebel.pdf}, confidential = {n}, abstract = {In modern commodity operating systems, core functionality is usually designed assuming that the underlying processor hardware always functions correctly. Shrinking hardware feature sizes break this assumption. Existing approaches to cope with these issues either use hardware functionality that is not available in commercial-off-the-shelf (COTS) systems or pose additional requirements on the software development side, making reuse of existing software hard, if not impossible. In this paper we present Romain, a framework that provides transparent redundant multithreading as an operating system service for hardware error detection and recovery. When applied to a standard benchmark suite, Romain requires a maximum runtime overhead of 30% for triple modular redundancy (while in many cases remaining below 5%). Furthermore, our approach minimizes the complexity added to the operating system for the sake of replication. }, }In modern commodity operating systems, core functionality is usually designed assuming that the underlying processor hardware always functions correctly. Shrinking hardware feature sizes break this assumption. Existing approaches to cope with these issues either use hardware functionality that is not available in commercial-off-the-shelf (COTS) systems or pose additional requirements on the software development side, making reuse of existing software hard, if not impossible. In this paper we present Romain, a framework that provides transparent redundant multithreading as an operating system service for hardware error detection and recovery. When applied to a standard benchmark suite, Romain requires a maximum runtime overhead of 30% for triple modular redundancy (while in many cases remaining below 5%). Furthermore, our approach minimizes the complexity added to the operating system for the sake of replication. Peter Marwedel and Michael Engel.Efficient Computing in Cyber-Physical Systems.
In Proceedings of SAMOS XII July 2012[BibTeX][PDF][Abstract]@inproceedings { marwedel:2012:samos, author = {Marwedel, Peter and Engel, Michael}, title = {Efficient Computing in Cyber-Physical Systems}, booktitle = {Proceedings of SAMOS XII}, year = {2012}, month = {jul}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-samos-marwedel.pdf}, confidential = {n}, abstract = {Computing in cyber-physical systems has to be efficient in terms of a number of objectives. In particular, computing has to be execution-time and energy efficient. In this paper, we will consider optimization techniques aiming at efficiency in terms of these two objectives. In the first part, we will consider techniques for the integration of compilers and worst-case execution time (WCET) estimation. We will demonstrate how such integration opens the door to WCET-reduction algorithms. For example, an algorithm for WCET-aware compilation reduces the WCET for an automotive application by more than 50\% by exploiting scratch pad memories (SPMs). In the second part, we will demonstrate techniques for improving the energy efficiency of cyber-physical systems, in particular the use of SPMs. In the third part, we demonstrate how the optimization for multiple objectives is taken into account. This paper provides an overview of work performed at the Chair for Embedded Systems of TU Dortmund and the Informatik Centrum Dortmund, Germany.}, }Computing in cyber-physical systems has to be efficient in terms of a number of objectives. In particular, computing has to be execution-time and energy efficient. In this paper, we will consider optimization techniques aiming at efficiency in terms of these two objectives. In the first part, we will consider techniques for the integration of compilers and worst-case execution time (WCET) estimation. We will demonstrate how such integration opens the door to WCET-reduction algorithms. For example, an algorithm for WCET-aware compilation reduces the WCET for an automotive application by more than 50% by exploiting scratch pad memories (SPMs). In the second part, we will demonstrate techniques for improving the energy efficiency of cyber-physical systems, in particular the use of SPMs. In the third part, we demonstrate how the optimization for multiple objectives is taken into account. This paper provides an overview of work performed at the Chair for Embedded Systems of TU Dortmund and the Informatik Centrum Dortmund, Germany. Christopher Boelmann, Torben Weis, Arno Wacker and Michael Engel.Self-Stabilizing Micro Controller for Large-Scale Sensor Networks in Spite of Program Counter Corruptions due to Soft Errors. In Proceedings of ICPADS December 2012[BibTeX][PDF][Abstract]@inproceedings { Boelmann:2012:ICPADS, author = {Boelmann, Christopher and Weis, Torben and Wacker, Arno and Engel, Michael}, title = {Self-Stabilizing Micro Controller for Large-Scale Sensor Networks in Spite of Program Counter Corruptions due to Soft Errors}, booktitle = {Proceedings of ICPADS}, year = {2012}, month = {dec}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-icpads-boelmann.pdf}, confidential = {n}, abstract = {For large installations of networked embedded systems it is important that each entity is self-stabilizing, because usually there is nobody to restart nodes that have hung up. Self-stabilization means to recover from temporary failures (soft errors) and adapt to a change of network topology caused by permanent failures.
On the software side self-stabilizing algorithms must assume that the hardware is executing the software correctly. In this paper we discuss cases in which soft errors invalidate this assumption, especially in cases where CPU registers or the watchdog timer are affected by the fault. Based on the observation that a guaranteed self-stabilization is only possible as long as the watchdog-timer is working properly after temporary failures, we propose and compare three different approaches that meet the requirements of sensor networks, to solve this problem with a combination of hardware- and software-modifications: 1) a run-time verification of every watchdog access; 2) a completely hardware-based approach, without any software modifications; 3) a 2X byte code alignment, to realign a corrupted program counter. Furthermore, we determine the average code-size increase and evaluate necessary hardware-modifications that come along with each approach.}, }For large installations of networked embedded systems it is important that each entity is self-stabilizing, because usually there is nobody to restart nodes that have hung up. Self-stabilization means to recover from temporary failures (soft errors) and adapt to a change of network topology caused by permanent failures. On the software side self-stabilizing algorithms must assume that the hardware is executing the software correctly. In this paper we discuss cases in which soft errors invalidate this assumption, especially in cases where CPU registers or the watchdog timer are affected by the fault. Based on the observation that a guaranteed self-stabilization is only possible as long as the watchdog-timer is working properly after temporary failures, we propose and compare three different approaches that meet the requirements of sensor networks, to solve this problem with a combination of hardware- and software-modifications: 1) a run-time verification of every watchdog access; 2) a completely hardware-based approach, without any software modifications; 3) a 2X byte code alignment, to realign a corrupted program counter. Furthermore, we determine the average code-size increase and evaluate necessary hardware-modifications that come along with each approach. Olivera Jovanovic, Peter Marwedel, Iuliana Bacivarov and Lothar Thiele.MAMOT: Memory-Aware Mapping Optimization Tool for MPSoC. In 15th Euromicro Conference on Digital System Design (DSD 2012)Izmir, Turkey, September 2012[BibTeX]@inproceedings { Jovanovic/etal/2012a, author = {Jovanovic, Olivera and Marwedel, Peter and Bacivarov, Iuliana and Thiele, Lothar}, title = {MAMOT: Memory-Aware Mapping Optimization Tool for MPSoC}, booktitle = {15th Euromicro Conference on Digital System Design (DSD 2012)}, year = {2012}, address = {Izmir, Turkey}, month = {September}, confidential = {n}, } Jörg Henkel, Lars Bauer, Joachim Becker, Oliver Bringmann, Uwe Brinkschulte, Samarjit Chakraborty, Michael Engel, Rolf Ernst, Hermann Härtig, Lars Hedrich, Andreas Herkersdorf, Rüdiger Kapitza, Daniel Lohmann, Peter Marwedel, Marco Platzner, Wolfgang Rosenstiel, Ulf Schlichtmann, Olaf Spinczyk, Mehdi Tahoori, Jürgen Teich, Norbert Wehn and Hans-Joachim Wunderlich.Design and Architectures for Dependable Embedded Systems.
In Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { SPP1500:11, author = {Henkel, J{\"o}rg and Bauer, Lars and Becker, Joachim and Bringmann, Oliver and Brinkschulte, Uwe and Chakraborty, Samarjit and Engel, Michael and Ernst, Rolf and H{\"a}rtig, Hermann and Hedrich, Lars and Herkersdorf, Andreas and Kapitza, R{\"u}diger and Lohmann, Daniel and Marwedel, Peter and Platzner, Marco and Rosenstiel, Wolfgang and Schlichtmann, Ulf and Spinczyk, Olaf and Tahoori, Mehdi and Teich, J{\"u}rgen and Wehn, Norbert and Wunderlich, Hans-Joachim}, title = {Design and Architectures for Dependable Embedded Systems}, booktitle = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, year = {2011}, address = {Taipei, Taiwan}, month = {oct}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-esweek-marwedel.pdf}, confidential = {n}, abstract = {The paper presents an overview of a major research project on dependable embedded systems that has started in Fall 2010 and is running for a projected duration of six years. Aim is a `dependability co-design' that spans various levels of abstraction in the design process of embedded systems starting from gate level through operating system, applications software to system architecture. In addition, we present a new classification on faults, errors, and failures.}, }The paper presents an overview of a major research project on dependable embedded systems that has started in Fall 2010 and is running for a projected duration of six years. Aim is a 'dependability co-design' that spans various levels of abstraction in the design process of embedded systems starting from gate level through operating system, applications software to system architecture. In addition, we present a new classification on faults, errors, and failures. Constantin Timm, Frank Weichert, David Fiedler, Christian Prasse, Heinrich Müller, Michael Hompel and Peter Marwedel.Decentralized Control of a Material Flow System enabled by an Embedded Computer Vision System. In Proceedings of the IEEE ICC 2011 Workshop on Embedding the Real World into the Future Internet June 2011[BibTeX][PDF][Abstract]@inproceedings { Timm:2011a, author = {Timm, Constantin and Weichert, Frank and Fiedler, David and Prasse, Christian and M{\"u}ller, Heinrich and Hompel, Michael and Marwedel, Peter}, title = {Decentralized Control of a Material Flow System enabled by an Embedded Computer Vision System}, booktitle = {Proceedings of the IEEE ICC 2011 Workshop on Embedding the Real World into the Future Internet}, year = {2011}, month = {jun}, file = {http://dx.doi.org/10.1109/iccw.2011.5963564}, confidential = {n}, abstract = {In this study, a novel sensor/actuator network approach for scalable automated facility logistics systems is presented. The approach comprises (1) a new sensor combination (cameras and few RFID scanners) for distributed detection, localization and identification of parcels and bins and (2) a novel middleware approach based on a service oriented architecture tailored towards the utilization in sensor/actuator networks.
The latter enables more flexible deployment of automated facility logistics systems, while the former presents a novel departure for the detection and tracking of bins and parcels in automated facility logistics systems: light barriers and bar code readers are substituted by low-cost cameras, local conveyor mounted embedded evaluation units and few RFID readers. By combining vision-based systems and RFID systems, this approach can compensate for the drawbacks of each respective system. By utilizing a state-of-the-art middleware for connecting all computer systems of an automated facility logistics system the costs for deployment and reconfiguring the system can be decreased. The paper describes image processing methods specific to the given problem to both track and read visual markers attached to parcels or bins, processing the data on an embedded system and communication/middleware aspects between different computer systems of an automated facility logistics system such as a database holding the loading and routing information of the conveyed objects as a service for the different visual sensor units. In addition, information from the RFID system is used to narrow the decision space for detection and identification. From an economic point of view this approach enables high density of identification while lowering hardware costs compared to state-of-the-art applications and, due to decentralized control, minimizing the effort for (re-)configuration. These innovations will make automated material flow systems more cost-efficient.}, }In this study, a novel sensor/actuator network approach for scalable automated facility logistics systems is presented. The approach comprises (1) a new sensor combination (cameras and few RFID scanners) for distributed detection, localization and identification of parcels and bins and (2) a novel middleware approach based on a service oriented architecture tailored towards the utilization in sensor/actuator networks. The latter enables more flexible deployment of automated facility logistics systems, while the former presents a novel departure for the detection and tracking of bins and parcels in automated facility logistics systems: light barriers and bar code readers are substituted by low-cost cameras, local conveyor mounted embedded evaluation units and few RFID readers. By combining vision-based systems and RFID systems, this approach can compensate for the drawbacks of each respective system. By utilizing a state-of-the-art middleware for connecting all computer systems of an automated facility logistics system the costs for deployment and reconfiguring the system can be decreased. The paper describes image processing methods specific to the given problem to both track and read visual markers attached to parcels or bins, processing the data on an embedded system and communication/middleware aspects between different computer systems of an automated facility logistics system such as a database holding the loading and routing information of the conveyed objects as a service for the different visual sensor units. In addition, information from the RFID system is used to narrow the decision space for detection and identification. From an economic point of view this approach enables high density of identification while lowering hardware costs compared to state-of-the-art applications and, due to decentralized control, minimizing the effort for (re-)configuration. These innovations will make automated material flow systems more cost-efficient.
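One mechanism from this entry, using RFID reads to narrow the decision space of the vision-based identification, lends itself to a small sketch. The following Python fragment is a loose illustration under invented data structures (score dictionaries and ID sets); it is not the paper's middleware or image processing code.

```python
# Hedged sketch: restrict the vision classifier's decision to the set of
# object IDs recently reported by nearby RFID readers. All IDs and scores
# below are hypothetical.

def identify(vision_scores, rfid_candidates):
    # vision_scores: {object_id: classifier confidence}
    # rfid_candidates: IDs plausible according to recent RFID reads
    feasible = {oid: s for oid, s in vision_scores.items()
                if oid in rfid_candidates}
    if not feasible:
        feasible = vision_scores  # fall back to pure vision if RFID is silent
    return max(feasible, key=feasible.get)

if __name__ == "__main__":
    scores = {"bin-17": 0.41, "bin-23": 0.39, "parcel-05": 0.20}
    # "bin-17" looks best visually, but RFID says it is not nearby:
    print(identify(scores, {"bin-23", "parcel-05"}))  # -> bin-23
```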
Constantin Timm, Pascal Libuschewski, Dominic Siedhoff, Frank Weichert, Heinrich Müller and Peter Marwedel.Improving Nanoobject Detection in Optical Biosensor Data. In Proceedings of the 5th International Symposium on Bio- and Medical Informatics and Cybernetics, BMIC 2011 July 2011[BibTeX][PDF][Abstract]@inproceedings { Timm:2011b, author = {Timm, Constantin and Libuschewski, Pascal and Siedhoff, Dominic and Weichert, Frank and M{\"u}ller, Heinrich and Marwedel, Peter}, title = {Improving Nanoobject Detection in Optical Biosensor Data}, booktitle = {Proceedings of the 5th International Symposium on Bio- and Medical Informatics and Cybernetics, BMIC 2011}, year = {2011}, month = {July}, file = {http://www.iiis.org/CDs2011/CD2011SCI/BMIC_2011/PapersPdf/BA536CW.pdf}, confidential = {n}, abstract = {The importance of real-time capable mobile biosensors increases in the face of rising numbers of global virus epidemics. Such biosensors can be used for on-site diagnosis, e.g. at airports, to prevent further spread of virus-transmitted diseases, by answering the question whether or not a sample contains a certain virus. In-depth laboratory analysis might furthermore demand measurements of the concentration of virus particles in a sample. The novel PAMONO sensor technique allows for accomplishing both tasks. One of its basic prerequisites is an efficient analysis of the biosensor image data by means of digital image processing and classification. In this study, we present a high performance approach to this analysis: The diagnosis whether a virus occurs in the sample can be carried out in real-time with high accuracy. An estimate of the concentration can be obtained in real-time as well, if that concentration is not too high. The contribution of this work is an optimization of our processing pipeline used for PAMONO sensor data analysis. The following objectives are optimized: detection-quality, speed and consumption of resources (e.g. energy, memory). Thus our approach respects the constraints imposed by medical applicability, as well as the constraints on resource consumption arising in embedded systems. The parameters to be optimized are descriptive (virus appearance parameters) and hardware-related (design space exploration). }, }The importance of real-time capable mobile biosensors increases in the face of rising numbers of global virus epidemics. Such biosensors can be used for on-site diagnosis, e.g. at airports, to prevent further spread of virus-transmitted diseases, by answering the question whether or not a sample contains a certain virus. In-depth laboratory analysis might furthermore demand measurements of the concentration of virus particles in a sample. The novel PAMONO sensor technique allows for accomplishing both tasks. One of its basic prerequisites is an efficient analysis of the biosensor image data by means of digital image processing and classification. In this study, we present a high performance approach to this analysis: The diagnosis whether a virus occurs in the sample can be carried out in real-time with high accuracy. An estimate of the concentration can be obtained in real-time as well, if that concentration is not too high. The contribution of this work is an optimization of our processing pipeline used for PAMONO sensor data analysis. The following objectives are optimized: detection-quality, speed and consumption of resources (e.g. energy, memory).
Thus our approach respects the constraints imposed by medical applicability, as well as the constraints on resource consumption arising in embedded systems. The parameters to be optimized are descriptive (virus appearance parameters) and hardware-related (design space exploration). Michael Engel, Florian Schmoll, Andreas Heinig and Peter Marwedel.Temporal Properties of Error Handling for Multimedia Applications. In Proceedings of the 14th ITG Conference on Electronic Media TechnologyDortmund / Germany, February 2011[BibTeX][PDF][Abstract]@inproceedings { engel:11:itg, author = {Engel, Michael and Schmoll, Florian and Heinig, Andreas and Marwedel, Peter}, title = {Temporal Properties of Error Handling for Multimedia Applications}, booktitle = {Proceedings of the 14th ITG Conference on Electronic Media Technology}, year = {2011}, address = {Dortmund / Germany}, month = {feb}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-itg-engel.pdf}, confidential = {n}, abstract = {In embedded consumer electronics devices, cost pressure is one of the driving design objectives. Devices that handle multimedia information, like DVD players or digital video cameras require high computing performance and real-time capabilities while adhering to the cost restrictions. The cost pressure often results in system designs that barely exceed the minimum requirements for such a system. Thus, hardware-based fault tolerance methods frequently are ignored due to their cost overhead. However, the amount of transient faults showing up in semiconductor-based systems is expected to increase sharply in the near future. Thus, low-overhead methods to correct related errors in such systems are required. Considering restrictions in processing speed, the real-time properties of a system with added error handling are of special interest. In this paper, we present our approach to flexible error handling and discuss the challenges as well as the inherent timing dependencies to deploy it in a typical soft real-time multimedia system, a H.264 video decoder.}, }In embedded consumer electronics devices, cost pressure is one of the driving design objectives. Devices that handle multimedia information, like DVD players or digital video cameras require high computing performance and real-time capabilities while adhering to the cost restrictions. The cost pressure often results in system designs that barely exceed the minimum requirements for such a system. Thus, hardware-based fault tolerance methods frequently are ignored due to their cost overhead. However, the amount of transient faults showing up in semiconductor-based systems is expected to increase sharply in the near future. Thus, low-overhead methods to correct related errors in such systems are required. Considering restrictions in processing speed, the real-time properties of a system with added error handling are of special interest. In this paper, we present our approach to flexible error handling and discuss the challenges as well as the inherent timing dependencies to deploy it in a typical soft real-time multimedia system, a H.264 video decoder. Emanuele Cannella, Lorenzo Di Gregorio, Leandro Fiorin, Menno Lindwer, Paolo Meloni, Olaf Neugebauer and Andy D. Pimentel.Towards an ESL Design Framework for Adaptive and Fault-tolerant MPSoCs: MADNESS or not?.
In Proceedings of the 9th IEEE/ACM Symposium on Embedded Systems for Real-Time Multimedia (ESTIMedia'11)Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { madness.2011, author = {Cannella, Emanuele and Gregorio, Lorenzo Di and Fiorin, Leandro and Lindwer, Menno and Meloni, Paolo and Neugebauer, Olaf and Pimentel, Andy D.}, title = {Towards an ESL Design Framework for Adaptive and Fault-tolerant MPSoCs: MADNESS or not?}, booktitle = {Proceedings of the 9th IEEE/ACM Symposium on Embedded Systems for Real-Time Multimedia (ESTIMedia'11)}, year = {2011}, address = {Taipei, Taiwan}, month = {October}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-estimedia-madness.pdf}, confidential = {n}, abstract = {The MADNESS project aims at the definition of innovative system-level design methodologies for embedded MP-SoCs, extending the classic concept of design space exploration in multi-application domains to cope with high heterogeneity, technology scaling and system reliability. The main goal of the project is to provide a framework able to guide designers and researchers to the optimal composition of embedded MPSoC architectures, according to the requirements and the features of a given target application field. The proposed approach will tackle the new challenges, related to both architecture and design methodologies, arising with the technology scaling, the system reliability and the ever-growing computational needs of modern applications. The methodologies proposed with this project act at different levels of the design flow, enhancing the state-of-the-art with novel features in system-level synthesis, architectural evaluation and prototyping. Support for fault resilience and efficient adaptive runtime management is introduced at hardware and middleware level, and considered by the system-level synthesis as one of the optimization factors to be taken into account. This paper presents the first stable results obtained in the MADNESS project, already demonstrating the effectiveness of the proposed methods.}, }The MADNESS project aims at the definition of innovative system-level design methodologies for embedded MP-SoCs, extending the classic concept of design space exploration in multi-application domains to cope with high heterogeneity, technology scaling and system reliability. The main goal of the project is to provide a framework able to guide designers and researchers to the optimal composition of embedded MPSoC architectures, according to the requirements and the features of a given target application field. The proposed approach will tackle the new challenges, related to both architecture and design methodologies, arising with the technology scaling, the system reliability and the ever-growing computational needs of modern applications. The methodologies proposed with this project act at different levels of the design flow, enhancing the state-of-the-art with novel features in system-level synthesis, architectural evaluation and prototyping. Support for fault resilience and efficient adaptive runtime management is introduced at hardware and middleware level, and considered by the system-level synthesis as one of the optimization factors to be taken into account. This paper presents the first stable results obtained in the MADNESS project, already demonstrating the effectiveness of the proposed methods.
Michael Engel, Florian Schmoll, Andreas Heinig and Peter Marwedel.Unreliable yet Useful -- Reliability Annotations for Data in Cyber-Physical Systems. In Proceedings of the 2011 Workshop on Software Language Engineering for Cyber-physical Systems (WS4C)Berlin / Germany, October 2011[BibTeX][PDF][Abstract]@inproceedings { engel:11:ws4c, author = {Engel, Michael and Schmoll, Florian and Heinig, Andreas and Marwedel, Peter}, title = {Unreliable yet Useful -- Reliability Annotations for Data in Cyber-Physical Systems}, booktitle = {Proceedings of the 2011 Workshop on Software Language Engineering for Cyber-physical Systems (WS4C)}, year = {2011}, address = {Berlin / Germany}, month = {oct}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-ws4c-engel.pdf}, confidential = {n}, abstract = {Today, cyber-physical systems face yet another challenge in addition to the traditional constraints in energy, computing power, or memory. Shrinking semiconductor structure sizes and supply voltages imply that the number of errors that manifest themselves in a system will rise significantly. Most CP systems have to survive errors, but many systems do not have sufficient resources to correct all errors that show up. Thus, it is important to spend the available resources on handling errors with the most critical effect. We propose an ``unreliability'' annotation for data types in C programs that indicates if an error showing up in a specific variable or data structure will possibly cause a severe problem like a program crash or might only show rather negligible effects, e.g., a discolored pixel in video decoding. This classification of data is supported by static analysis methods that verify if the value contained in a variable marked as unreliable does not end up as part of a critical operation, e.g., an array index or loop termination condition. This classification enables several approaches to flexible error handling. For example, a CP system designer might choose to selectively safeguard variables marked as non-unreliable or to employ memories with different reliability properties to store the respective values.}, }Today, cyber-physical systems face yet another challenge in addition to the traditional constraints in energy, computing power, or memory. Shrinking semiconductor structure sizes and supply voltages imply that the number of errors that manifest themselves in a system will rise significantly. Most CP systems have to survive errors, but many systems do not have sufficient resources to correct all errors that show up. Thus, it is important to spend the available resources on handling errors with the most critical effect. We propose an "unreliability" annotation for data types in C programs that indicates if an error showing up in a specific variable or data structure will possibly cause a severe problem like a program crash or might only show rather negligible effects, e.g., a discolored pixel in video decoding. This classification of data is supported by static analysis methods that verify if the value contained in a variable marked as unreliable does not end up as part of a critical operation, e.g., an array index or loop termination condition. This classification enables several approaches to flexible error handling. For example, a CP system designer might choose to selectively safeguard variables marked as non-unreliable or to employ memories with different reliability properties to store the respective values.
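The classification idea in this entry, keeping values marked as unreliable away from critical sinks such as array indices or loop bounds, can be mimicked in a toy form. The sketch below uses a Python wrapper and a dynamic check purely for illustration; the paper itself proposes a C type annotation enforced by static analysis, and all names here are invented.

```python
# Toy illustration: data tagged as unreliable must never reach a critical
# operation such as an array index. The real approach is a static check
# on annotated C programs; this dynamic stand-in only shows the intent.

class Unreliable:
    # Wrapper tagging data whose occasional bit errors are tolerable.
    def __init__(self, value):
        self.value = value

def critical_index(seq, idx):
    # Critical sink: indexing must not depend on unreliable data.
    if isinstance(idx, Unreliable):
        raise TypeError("unreliable value used as array index")
    return seq[idx]

if __name__ == "__main__":
    pixel = Unreliable(137)          # a flipped bit here only discolors a pixel
    luma_table = list(range(256))
    print(critical_index(luma_table, 3))   # reliable index: fine
    try:
        critical_index(luma_table, pixel)  # rejected: could crash the decoder
    except TypeError as err:
        print(err)
```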
Heiko Falk and Helena Kotthaus.WCET-driven Cache-aware Code Positioning. In Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES), pages 145-154Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { falk:11:cases, author = {Falk, Heiko and Kotthaus, Helena}, title = {WCET-driven Cache-aware Code Positioning}, booktitle = {Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES)}, year = {2011}, pages = {145-154}, address = {Taipei, Taiwan}, month = {oct}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-cases_1.pdf}, confidential = {n}, abstract = {Code positioning is a well-known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of code fragments in memory avoids overlapping of cache sets and thus decreases the number of cache conflict misses. We present a novel cache-aware code positioning optimization driven by worst-case execution time (WCET) information. For this purpose, we introduce a formal cache model based on a conflict graph which is able to capture a broad class of cache architectures. This cache model is combined with a formal WCET timing model, resulting in a cache conflict graph weighted with WCET data. This conflict graph is then exploited by heuristics for code positioning of both basic blocks and entire functions. Code positioning is able to decrease the accumulated cache misses for a total of 18 real-life benchmarks by 15.5% on average for an automotive processor featuring a 2-way set-associative cache. These cache miss reductions translate to average WCET reductions by 6.1%. For direct-mapped caches, even larger savings of 18.8% (cache misses) and 9.0% (WCET) were achieved. }, }Code positioning is a well-known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of code fragments in memory avoids overlapping of cache sets and thus decreases the number of cache conflict misses. We present a novel cache-aware code positioning optimization driven by worst-case execution time (WCET) information. For this purpose, we introduce a formal cache model based on a conflict graph which is able to capture a broad class of cache architectures. This cache model is combined with a formal WCET timing model, resulting in a cache conflict graph weighted with WCET data. This conflict graph is then exploited by heuristics for code positioning of both basic blocks and entire functions. Code positioning is able to decrease the accumulated cache misses for a total of 18 real-life benchmarks by 15.5% on average for an automotive processor featuring a 2-way set-associative cache. These cache miss reductions translate to average WCET reductions by 6.1%. For direct-mapped caches, even larger savings of 18.8% (cache misses) and 9.0% (WCET) were achieved. Heiko Falk, Norman Schmitz and Florian Schmoll.WCET-aware Register Allocation based on Integer-Linear Programming. 
In Proceedings of the 23rd Euromicro Conference on Real-Time Systems (ECRTS), pages 13-22Porto / Portugal, July 2011[BibTeX][PDF][Abstract]@inproceedings { falk:11:ecrts, author = {Falk, Heiko and Schmitz, Norman and Schmoll, Florian}, title = {WCET-aware Register Allocation based on Integer-Linear Programming}, booktitle = {Proceedings of the 23rd Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2011}, pages = {13-22}, address = {Porto / Portugal}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-ecrts_2.pdf}, confidential = {n}, abstract = {Current compilers lack precise timing models guiding their built-in optimizations. Hence, compilers apply ad-hoc heuristics during optimization to improve code quality. One of the most important optimizations is register allocation. Many compilers heuristically decide when and where to spill a register to memory, without having a clear understanding of the impact of such spill code on a program's runtime. This paper presents an integer-linear programming \textit{(ILP)} based register allocator that uses precise worst-case execution time \textit{(WCET)} models. Using this WCET timing data, the compiler avoids spill code generation along the critical path defining a program's WCET. To the best of our knowledge, this paper is the first one to present a WCET-aware ILP-based register allocator. Our results underline the effectiveness of the proposed techniques. For a total of 55 realistic benchmarks, we reduced WCETs by 20.2\% on average and ACETs by 14\%, compared to a standard graph coloring allocator. Furthermore, our ILP-based register allocator outperforms a WCET-aware graph coloring allocator by more than a factor of two for the considered benchmarks, while requiring less runtime.}, }Current compilers lack precise timing models guiding their built-in optimizations. Hence, compilers apply ad-hoc heuristics during optimization to improve code quality. One of the most important optimizations is register allocation. Many compilers heuristically decide when and where to spill a register to memory, without having a clear understanding of the impact of such spill code on a program's runtime. This paper presents an integer-linear programming (ILP) based register allocator that uses precise worst-case execution time (WCET) models. Using this WCET timing data, the compiler avoids spill code generation along the critical path defining a program's WCET. To the best of our knowledge, this paper is the first one to present a WCET-aware ILP-based register allocator. Our results underline the effectiveness of the proposed techniques. For a total of 55 realistic benchmarks, we reduced WCETs by 20.2% on average and ACETs by 14%, compared to a standard graph coloring allocator. Furthermore, our ILP-based register allocator outperforms a WCET-aware graph coloring allocator by more than a factor of two for the considered benchmarks, while requiring less runtime. Constantin Timm, Frank Weichert, Peter Marwedel and Heinrich Müller.Multi-Objective Local Instruction Scheduling for GPGPU Applications. 
In Proceedings of the International Conference on Parallel and Distributed Computing Systems 2011 (PDCS) Dallas, USA, December 2011[BibTeX][PDF][Abstract]@inproceedings { timm:2011:pdcs, author = {Timm, Constantin and Weichert, Frank and Marwedel, Peter and M{\"u}ller, Heinrich}, title = {Multi-Objective Local Instruction Scheduling for GPGPU Applications}, booktitle = {Proceedings of the International Conference on Parallel and Distributed Computing Systems 2011 (PDCS) }, year = {2011}, address = {Dallas, USA}, month = {December}, publisher = {IASTED/ACTA Press}, file = {http://www.actapress.com/PaperInfo.aspx?paperId=453074}, confidential = {n}, abstract = {In this paper, a new optimization approach (MOLIS: Multi-Objective Local Instruction Scheduling) is presented which maximizes the performance and minimizes the energy consumption of GPGPU applications. The design process of writing efficient GPGPU applications is time-consuming. This disadvantage mainly arises from the fact that the optimization of an application is accomplished in an expensive trial-and-error manner without efficient compiler support. Especially, efficient register utilization and load balancing of the concurrently working instruction and memory pipelines were not considered in the compile process up to now. Another drawback of the state-of-the-art GPGPU application design process is that energy consumption is not taken into account, which is important in the face of green computing. In order to optimize performance and energy consumption simultaneously, a multi-objective genetic algorithm was utilized. The optimization of GPGPU applications in MOLIS employs local instruction scheduling methods. The optimization potential of MOLIS was evaluated by profiling the runtime and the energy consumption on a real platform. The optimization approach was tested with several real-world benchmarks stemming from the Nvidia CUDA examples, the VSIPL-GPGPU-Library and the Rodinia benchmark suite. By applying MOLIS to the real-world benchmarks, up to 9% energy and 12% runtime can be saved.}, }In this paper, a new optimization approach (MOLIS: Multi-Objective Local Instruction Scheduling) is presented which maximizes the performance and minimizes the energy consumption of GPGPU applications. The design process of writing efficient GPGPU applications is time-consuming. This disadvantage mainly arises from the fact that the optimization of an application is accomplished in an expensive trial-and-error manner without efficient compiler support. Especially, efficient register utilization and load balancing of the concurrently working instruction and memory pipelines were not considered in the compile process up to now. Another drawback of the state-of-the-art GPGPU application design process is that energy consumption is not taken into account, which is important in the face of green computing. In order to optimize performance and energy consumption simultaneously, a multi-objective genetic algorithm was utilized. The optimization of GPGPU applications in MOLIS employs local instruction scheduling methods. The optimization potential of MOLIS was evaluated by profiling the runtime and the energy consumption on a real platform. The optimization approach was tested with several real-world benchmarks stemming from the Nvidia CUDA examples, the VSIPL-GPGPU-Library and the Rodinia benchmark suite. By applying MOLIS to the real-world benchmarks, up to 9% energy and 12% runtime can be saved. 
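The selection step at the heart of such a multi-objective optimizer can be sketched as plain Pareto filtering over the two objectives. The candidate values below are invented for illustration; MOLIS itself uses a genetic algorithm on instruction schedules rather than a fixed candidate list:

```python
# Minimal Pareto-filtering sketch for (energy, runtime) minimization,
# the core selection step of a multi-objective optimizer such as MOLIS.
# Candidate numbers are invented.

def dominates(a, b):
    """a dominates b: no worse in every objective, better in at least one."""
    return all(x <= y for x, y in zip(a, b)) and a != b

def pareto_front(candidates):
    return [c for c in candidates
            if not any(dominates(other, c) for other in candidates)]

# (energy in J, runtime in ms) of hypothetical instruction schedules
schedules = [(9.1, 120), (8.4, 131), (9.0, 118), (10.2, 117), (8.4, 140)]
print(pareto_front(schedules))  # [(8.4, 131), (9.0, 118), (10.2, 117)]
```

The genetic algorithm then breeds new schedules from the surviving front instead of enumerating all candidates.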
Timon Kelter, Heiko Falk, Peter Marwedel, Sudipta Chattopadhyay and Abhik Roychoudhury.Bus-Aware Multicore WCET Analysis through TDMA Offset Bounds. In Proceedings of the 23rd Euromicro Conference on Real-Time Systems (ECRTS), pages 3-12Porto / Portugal, July 2011[BibTeX][PDF][Abstract]@inproceedings { kelter:11:ecrts, author = {Kelter, Timon and Falk, Heiko and Marwedel, Peter and Chattopadhyay, Sudipta and Roychoudhury, Abhik}, title = {Bus-Aware Multicore WCET Analysis through TDMA Offset Bounds}, booktitle = {Proceedings of the 23rd Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2011}, pages = {3-12}, address = {Porto / Portugal}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-ecrts_1.pdf}, confidential = {n}, abstract = {In the domain of real-time systems, the analysis of the timing behavior of programs is crucial for guaranteeing the schedulability and thus the safety of a system. Static analyses of the \textit{WCET} (Worst-Case Execution Time) have proven to be a key element for timing analysis, as they provide safe upper bounds on a program's execution time. For single-core systems, industrial-strength WCET analyzers are already available, but up to now, only first proposals have been made to analyze the WCET in multicore systems, where the different cores may interfere during the access to shared resources. An important example of this is a shared bus which connects the cores to a shared main memory. The time to gain access to the shared bus may vary significantly, depending on the used bus arbitration protocol and the access timings. In this paper, we propose a new technique for analyzing the duration of accesses to shared buses. We implemented a prototype tool which uses the new analysis and tested it on a set of real-world benchmarks. Results demonstrate that our analysis achieves the same precision as the best existing approach while drastically outperforming it in matters of analysis time.}, }In the domain of real-time systems, the analysis of the timing behavior of programs is crucial for guaranteeing the schedulability and thus the safety of a system. Static analyses of the WCET (Worst-Case Execution Time) have proven to be a key element for timing analysis, as they provide safe upper bounds on a program's execution time. For single-core systems, industrial-strength WCET analyzers are already available, but up to now, only first proposals have been made to analyze the WCET in multicore systems, where the different cores may interfere during the access to shared resources. An important example of this is a shared bus which connects the cores to a shared main memory. The time to gain access to the shared bus may vary significantly, depending on the used bus arbitration protocol and the access timings. In this paper, we propose a new technique for analyzing the duration of accesses to shared buses. We implemented a prototype tool which uses the new analysis and tested it on a set of real-world benchmarks. Results demonstrate that our analysis achieves the same precision as the best existing approach while drastically outperforming it in matters of analysis time.
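The quantity such an offset analysis bounds can be shown with a toy model: given the TDMA bus schedule and an interval of possible offsets (the position within the bus period at which a core issues an access), compute the worst-case wait until the core's slot is open. Slot layout and offsets below are invented, and the paper's actual analysis propagates offset intervals through the program rather than enumerating them:

```python
# Toy model of a TDMA offset bound: worst-case wait until the analyzed
# core's bus slot, over an interval of possible access offsets.
# Parameters are invented; single-cycle accesses are assumed.

def wait_time(offset, slot_start, slot_len, period):
    """Cycles from `offset` until the core's slot is open."""
    if slot_start <= offset < slot_start + slot_len:
        return 0                           # slot open: access proceeds now
    return (slot_start - offset) % period  # wait for the next slot start

def max_delay(offset_bounds, slot_start, slot_len, period):
    lo, hi = offset_bounds
    return max(wait_time(o % period, slot_start, slot_len, period)
               for o in range(lo, hi + 1))

# 4 cores, 10-cycle slots, 40-cycle period; the analyzed core owns [0, 10).
print(max_delay((8, 25), slot_start=0, slot_len=10, period=40))  # 30
```

Keeping such offset intervals tight is what makes the analysis precise without simulating every interleaving of the cores.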
Sascha Plazar, Jan C. Kleinsorge, Heiko Falk and Peter Marwedel.WCET-driven Branch Prediction aware Code Positioning. In Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES), pages 165-174Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { plazar:11:cases, author = {Plazar, Sascha and Kleinsorge, Jan C. and Falk, Heiko and Marwedel, Peter}, title = {WCET-driven Branch Prediction aware Code Positioning}, booktitle = {Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES)}, year = {2011}, pages = {165-174}, address = {Taipei, Taiwan}, month = {oct}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-cases_2.pdf}, confidential = {n}, abstract = {In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches, branch prediction units and speculative execution. This step was necessary in order to fulfill increasing requirements on computational power. Static analysis techniques considering such speculative units had to be developed to allow the estimation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a WCET-driven branch prediction aware optimization which reorders basic blocks of a function in order to reduce the number of jump instructions and mispredicted branches. We employed a genetic algorithm which rearranges basic blocks in order to decrease the WCET of a program. This enables a first estimation of the possible optimization potential at the cost of high optimization runtimes. To avoid time-consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP). The ILP models the worst-case execution path (WCEP) of a program and takes branch prediction effects into account. This algorithm enables short optimization runtimes at slightly decreased optimization results. In a case study, the genetic algorithm is able to reduce the benchmarks’ WCET by up to 24.7% whereas our ILP-based approach is able to decrease the WCET by up to 20.0%. }, }In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches, branch prediction units and speculative execution. This step was necessary in order to fulfill increasing requirements on computational power. Static analysis techniques considering such speculative units had to be developed to allow the estimation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a WCET-driven branch prediction aware optimization which reorders basic blocks of a function in order to reduce the number of jump instructions and mispredicted branches. We employed a genetic algorithm which rearranges basic blocks in order to decrease the WCET of a program. This enables a first estimation of the possible optimization potential at the cost of high optimization runtimes. To avoid time-consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP).
The ILP models the worst-case execution path (WCEP) of a program and takes branch prediction effects into account. This algorithm enables short optimization runtimes at slightly decreased optimization results. In a case study, the genetic algorithm is able to reduce the benchmarks’ WCET by up to 24.7% whereas our ILP-based approach is able to decrease the WCET by up to 20.0%. Daniel Cordes, Andreas Heinig, Peter Marwedel and Arindam Mallik.Automatic Extraction of Pipeline Parallelism for Embedded Software Using Linear Programming. In Proceedings of the 17th IEEE International Conference on Parallel and Distributed Systems (ICPADS), 2011, pages 699-706Tainan, Taiwan, December 2011[BibTeX][PDF][Abstract]@inproceedings { cordes:2011:icpads, author = {Cordes, Daniel and Heinig, Andreas and Marwedel, Peter and Mallik, Arindam}, title = {Automatic Extraction of Pipeline Parallelism for Embedded Software Using Linear Programming}, booktitle = {Proceedings of the 17th IEEE International Conference on Parallel and Distributed Systems (ICPADS), 2011}, year = {2011}, pages = {699-706}, address = {Tainan, Taiwan}, month = {dec}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-icpads-cordes.pdf}, confidential = {n}, abstract = {The complexity and performance requirements of embedded software are continuously increasing, making Multiprocessor System-on-Chip (MPSoC) architectures more and more important in the domain of embedded and cyber-physical systems. Using multiple cores in a single system reduces problems concerning energy consumption and heat dissipation, and increases performance. Nevertheless, these benefits do not come for free. Porting existing, mostly sequential, applications to MPSoCs requires extracting efficient parallelism to utilize all available cores. Many embedded applications, like network services and multimedia tasks for voice-, image- and video processing, are operating on data streams and thus have a streaming-based structure. Despite the abundance of parallelism in streaming applications, it is a non-trivial task to split and efficiently map sequential applications to MPSoCs. Therefore, we present an algorithm which automatically extracts pipeline parallelism from sequential ANSI-C applications. The presented tool employs an integer linear programming (ILP) based approach enriched with an adequate cost model to automatically control the granularity of the parallelization. By applying our tool to real-life applications, it can be shown that our approach is able to speed up applications by a factor of up to 3.9x on a four-core MPSoC architecture, compared to a sequential execution.}, }The complexity and performance requirements of embedded software are continuously increasing, making Multiprocessor System-on-Chip (MPSoC) architectures more and more important in the domain of embedded and cyber-physical systems. Using multiple cores in a single system reduces problems concerning energy consumption and heat dissipation, and increases performance. Nevertheless, these benefits do not come for free. Porting existing, mostly sequential, applications to MPSoCs requires extracting efficient parallelism to utilize all available cores. Many embedded applications, like network services and multimedia tasks for voice-, image- and video processing, are operating on data streams and thus have a streaming-based structure.
Despite the abundance of parallelism in streaming applications, it is a non-trivial task to split and efficiently map sequential applications to MPSoCs. Therefore, we present an algorithm which automatically extracts pipeline parallelism from sequential ANSI-C applications. The presented tool employs an integer linear programming (ILP) based approach enriched with an adequate cost model to automatically control the granularity of the parallelization. By applying our tool to real-life applications, it can be shown that our approach is able to speed up applications by a factor of up to 3.9x on a four-core MPSoC architecture, compared to a sequential execution. Peter Marwedel and Michael Engel.Embedded System Design 2.0: Rationale Behind a Textbook Revision. In Proceedings of Workshop on Embedded Systems Education (WESE)Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { marwedel:2011:wese, author = {Marwedel, Peter and Engel, Michael}, title = {Embedded System Design 2.0: Rationale Behind a Textbook Revision}, booktitle = {Proceedings of Workshop on Embedded Systems Education (WESE)}, year = {2011}, address = {Taipei, Taiwan}, month = {October}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-wese-marwedel.pdf}, confidential = {n}, abstract = {Seven years after its first release, it became necessary to publish a new edition of the author’s textbook on embedded system design. This paper explains the key changes that were incorporated into the second edition. These changes reflect seven years of teaching of the subject, with two courses every year. The rationale behind these changes can also be found in the paper. In this way, the paper also reflects changes in the area over time, while the area becomes more mature. The paper helps in understanding why a particular topic is included in this curriculum for embedded system design and why a certain structure of the course is suggested.}, }Seven years after its first release, it became necessary to publish a new edition of the author’s textbook on embedded system design. This paper explains the key changes that were incorporated into the second edition. These changes reflect seven years of teaching of the subject, with two courses every year. The rationale behind these changes can also be found in the paper. In this way, the paper also reflects changes in the area over time, while the area becomes more mature. The paper helps in understanding why a particular topic is included in this curriculum for embedded system design and why a certain structure of the course is suggested. Horst Schirmeier, Jens Neuhalfen, Ingo Korb, Olaf Spinczyk and Michael Engel.RAMpage: Graceful Degradation Management for Memory Errors in Commodity Linux Servers.
In Proceedings of the 11th IEEE Pacific Rim International Symposium on Dependable Computing (PRDC 2011)Pasadena, USA, 2011[BibTeX][PDF][Abstract]@inproceedings { schirmeier:11:prdc, author = {Schirmeier, Horst and Neuhalfen, Jens and Korb, Ingo and Spinczyk, Olaf and Engel, Michael}, title = {RAMpage: Graceful Degradation Management for Memory Errors in Commodity Linux Servers}, booktitle = {Proceedings of the 11th IEEE Pacific Rim International Symposium on Dependable Computing (PRDC 2011)}, year = {2011}, address = {Pasadena, USA}, organization = {IEEE Computer Society Press}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-prdc-schirmeier.pdf}, confidential = {n}, abstract = {Memory errors are a major source of reliability problems in current computers. Undetected errors may result in program termination, or, even worse, silent data corruption. Recent studies have shown that the frequency of permanent memory errors is an order of magnitude higher than previously assumed and regularly affects everyday operation. Often, neither additional circuitry to support hardware-based error detection nor downtime for performing hardware tests can be afforded. In the case of permanent memory errors, a system faces two challenges: detecting errors as early as possible and handling them while avoiding system downtime. To increase system reliability, we have developed RAMpage, an online memory testing infrastructure for commodity x86-64-based Linux servers, which is capable of efficiently detecting memory errors and which provides graceful degradation by withdrawing affected memory pages from further use. We describe the design and implementation of RAMpage and present results of an extensive qualitative as well as quantitative evaluation.}, }Memory errors are a major source of reliability problems in current computers. Undetected errors may result in program termination, or, even worse, silent data corruption. Recent studies have shown that the frequency of permanent memory errors is an order of magnitude higher than previously assumed and regularly affects everyday operation. Often, neither additional circuitry to support hardware-based error detection nor downtime for performing hardware tests can be afforded. In the case of permanent memory errors, a system faces two challenges: detecting errors as early as possible and handling them while avoiding system downtime. To increase system reliability, we have developed RAMpage, an online memory testing infrastructure for commodity x86-64-based Linux servers, which is capable of efficiently detecting memory errors and which provides graceful degradation by withdrawing affected memory pages from further use. We describe the design and implementation of RAMpage and present results of an extensive qualitative as well as quantitative evaluation.
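The degradation policy itself fits in a few lines: test a frame with complementary bit patterns and withdraw it from the free pool on the first mismatch. The sketch below is a schematic model with an artificially injected stuck bit, not the RAMpage implementation, which tests live page frames of a running Linux kernel:

```python
# Schematic model of RAMpage-style graceful degradation: scrub-test each
# page frame with complementary bit patterns and withdraw it from further
# use on the first mismatch. Here a "page" is just a bytearray, and a
# stuck bit is injected artificially for demonstration.

PATTERNS = (0x55, 0xAA, 0x00, 0xFF)

def page_is_healthy(page):
    for pat in PATTERNS:
        for i in range(len(page)):
            page[i] = pat                  # write test pattern
        if any(b != pat for b in page):    # read back and compare
            return False
    return True

def scrub(pages):
    """Return indices of pages to withdraw from the allocator's free pool."""
    return [i for i, p in enumerate(pages) if not page_is_healthy(p)]

class StuckBitPage(bytearray):
    """Fault model: bit 0 of byte 42 is stuck at 0."""
    def __setitem__(self, i, v):
        super().__setitem__(i, v & 0xFE if i == 42 else v)

pages = [bytearray(64) for _ in range(4)]
pages[2] = StuckBitPage(64)
print(scrub(pages))  # [2] -> page 2 is withdrawn, the rest stay usable
```

Withdrawing a page trades a small amount of capacity for continued operation, which is the graceful-degradation idea the paper evaluates.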
Jan C. Kleinsorge, Heiko Falk and Peter Marwedel.A Synergetic Approach To Accurate Analysis Of Cache-Related Preemption Delay. In Proceedings of the International Conference on Embedded Software (EMSOFT), pages 329-338Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { kleinsorge:11:emsoft, author = {Kleinsorge, Jan C. and Falk, Heiko and Marwedel, Peter}, title = {A Synergetic Approach To Accurate Analysis Of Cache-Related Preemption Delay}, booktitle = {Proceedings of the International Conference on Embedded Software (EMSOFT)}, year = {2011}, pages = {329-338}, address = {Taipei, Taiwan}, month = {oct}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-emsoft.pdf}, confidential = {n}, abstract = {The worst-case execution time (WCET) of a task denotes the largest possible execution time for all possible inputs and thus, hardware states. For non-preemptive multitask scheduling, techniques for the static estimation of safe upper bounds have been subject to industrial practice for years. For preemptive scheduling however, the isolated analysis of tasks becomes imprecise as interferences among tasks cannot be considered with sufficient precision. For such scenarios, the cache-related preemption delay (CRPD) denotes a key metric as it reflects the effects of preemptions on the execution behavior of a single task. Until recently, proposals for CRPD analyses were often limited to direct-mapped caches or comparably imprecise for k-way set-associative caches. In this paper, we propose how the current best techniques for CRPD analysis, which have only been proposed separately and for different aspects of the analysis, can be brought together to construct an efficient CRPD analysis with unique properties. Moreover, along the construction, we propose several different enhancements to the methods employed. We also exploit that in a complete approach, analysis steps are synergetic and can be combined into a single analysis pass solving all formerly separate steps at once. In addition, we argue that it is often sufficient to carry out the combined analysis on basic block bounds, which further lowers the overall complexity. The result is a proposal for a fast CRPD analysis of very high accuracy. }, }The worst-case execution time (WCET) of a task denotes the largest possible execution time for all possible inputs and thus, hardware states. For non-preemptive multitask scheduling, techniques for the static estimation of safe upper bounds have been subject to industrial practice for years. For preemptive scheduling however, the isolated analysis of tasks becomes imprecise as interferences among tasks cannot be considered with sufficient precision. For such scenarios, the cache-related preemption delay (CRPD) denotes a key metric as it reflects the effects of preemptions on the execution behavior of a single task. Until recently, proposals for CRPD analyses were often limited to direct-mapped caches or comparably imprecise for k-way set-associative caches. In this paper, we propose how the current best techniques for CRPD analysis, which have only been proposed separately and for different aspects of the analysis, can be brought together to construct an efficient CRPD analysis with unique properties. Moreover, along the construction, we propose several different enhancements to the methods employed. We also exploit that in a complete approach, analysis steps are synergetic and can be combined into a single analysis pass solving all formerly separate steps at once. In addition, we argue that it is often sufficient to carry out the combined analysis on basic block bounds, which further lowers the overall complexity. The result is a proposal for a fast CRPD analysis of very high accuracy.
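A classic building block that such analyses refine is the bound obtained by intersecting the preempted task's useful cache blocks (UCB) with the preempting task's evicted cache blocks (ECB). The sketch below shows only this textbook bound for a direct-mapped cache with invented block sets; the combined analysis proposed in the paper is considerably more precise:

```python
# Textbook CRPD building block (not the paper's combined analysis):
# bound the preemption delay by the cache sets that are both Useful to
# the preempted task (UCB) and Evicted by the preempting task (ECB),
# multiplied by the block reload time. Direct-mapped cache assumed;
# the set contents are invented.

def crpd_bound(ucb, ecb, block_reload_time):
    """UCB and ECB are sets of cache-set indices."""
    return len(ucb & ecb) * block_reload_time

ucb = {1, 3, 4, 7}     # sets whose blocks the preempted task will reuse
ecb = {0, 1, 2, 3, 8}  # sets touched by the preempting task
print(crpd_bound(ucb, ecb, block_reload_time=10))  # 20 cycles
```

For k-way set-associative caches this simple intersection becomes imprecise, which is exactly the gap the paper's synergetic analysis addresses.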
Samarjit Chakraborty, Marco Di Natale, Heiko Falk, Martin Lukasiewyzc and Frank Slomka.Timing and Schedulability Analysis for Distributed Automotive Control Applications. In Tutorial at the International Conference on Embedded Software (EMSOFT), pages 349-350Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { falk:11:emsoft_tutorial, author = {Chakraborty, Samarjit and Di Natale, Marco and Falk, Heiko and Lukasiewyzc, Martin and Slomka, Frank}, title = {Timing and Schedulability Analysis for Distributed Automotive Control Applications}, booktitle = {Tutorial at the International Conference on Embedded Software (EMSOFT)}, year = {2011}, pages = {349-350}, address = {Taipei, Taiwan}, month = {oct}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-emsoft_tutorial.pdf}, confidential = {n}, abstract = {High-end cars today consist of more than 100 electronic control units (ECUs) that are connected to a set of sensors and actuators and run multiple distributed control applications. The design flow of such architectures consists of specifying control applications as Simulink/Stateflow models, followed by generating code from them and finally mapping such code onto multiple ECUs. In addition, the scheduling policies and parameters on both the ECUs and the communication buses over which they communicate also need to be specified. These policies and parameters are computed from high-level timing and control performance constraints. The proposed tutorial will cover different aspects of this design flow, with a focus on timing and schedulability problems. After reviewing the basic concepts of worst-case execution time analysis and schedulability analysis, we will discuss the differences between meeting timing constraints (as in classical real-time systems) and meeting control performance constraints (e.g., stability, steady and transient state performance). We will then describe various control performance related schedulability analysis techniques and how they may be tied to model-based software development. Finally, we will discuss various schedule synthesis techniques, both for ECUs as well as for communication protocols like FlexRay, so that control performance constraints specified at the model-level may be satisfied. Throughout the tutorial different commercial as well as academic tools will be discussed and demonstrated. }, }High-end cars today consist of more than 100 electronic control units (ECUs) that are connected to a set of sensors and actuators and run multiple distributed control applications. The design flow of such architectures consists of specifying control applications as Simulink/Stateflow models, followed by generating code from them and finally mapping such code onto multiple ECUs. In addition, the scheduling policies and parameters on both the ECUs and the communication buses over which they communicate also need to be specified. These policies and parameters are computed from high-level timing and control performance constraints. The proposed tutorial will cover different aspects of this design flow, with a focus on timing and schedulability problems. After reviewing the basic concepts of worst-case execution time analysis and schedulability analysis, we will discuss the differences between meeting timing constraints (as in classical real-time systems) and meeting control performance constraints (e.g., stability, steady and transient state performance).
We will then describe various control performance related schedulability analysis techniques and how they may be tied to model-based software development. Finally, we will discuss various schedule synthesis techniques, both for ECUs as well as for communication protocols like FlexRay, so that control performance constraints specified at the model-level may be satisfied. Throughout the tutorial different commercial as well as academic tools will be discussed and demonstrated. Peter Marwedel, Jürgen Teich, Georgia Kouveli, Iuliana Bacivarov, Lothar Thiele, Soonhoi Ha, Chanhee Lee, Qiang Xu and Lin Huang.Mapping of Applications to MPSoCs. In Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)Taipei, Taiwan, October 2011[BibTeX][PDF][Abstract]@inproceedings { marwedel:2011:codes-isss2, author = {Marwedel, Peter and Teich, J\"urgen and Kouveli, Georgia and Bacivarov, Iuliana and Thiele, Lothar and Ha, Soonhoi and Lee, Chanhee and Xu, Qiang and Huang, Lin}, title = {Mapping of Applications to MPSoCs}, booktitle = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, year = {2011}, address = {Taipei, Taiwan}, month = {October}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-codes-isss-marwedel.pdf}, confidential = {n}, abstract = {The advent of embedded many-core architectures results in the need to come up with techniques for mapping embedded applications onto such architectures. This paper presents a representative set of such techniques. The techniques focus on optimizing performance, temperature distribution, reliability and fault tolerance for various models.}, }The advent of embedded many-core architectures results in the need to come up with techniques for mapping embedded applications onto such architectures. This paper presents a representative set of such techniques. The techniques focus on optimizing performance, temperature distribution, reliability and fault tolerance for various models. Robert Pyka, Felipe Klein, Peter Marwedel and Stylianos Mamagkakis.Versatile System-Level Memory-Aware Platform Description Approach for Embedded MPSoCs. In Proc. of the ACM SIGPLAN/SIGBED 2010 Conference on Languages, Compilers, and Tools for Embedded Systems, pages 9-16 2010[BibTeX][Abstract]@inproceedings { pyka:2010, author = {Pyka, Robert and Klein, Felipe and Marwedel, Peter and Mamagkakis, Stylianos}, title = {Versatile System-Level Memory-Aware Platform Description Approach for Embedded MPSoCs}, booktitle = {Proc. of the ACM SIGPLAN/SIGBED 2010 Conference on Languages, Compilers, and Tools for Embedded Systems}, year = {2010}, pages = {9-16}, publisher = {ACM}, confidential = {n}, abstract = {In this paper, we present a novel system modeling language which targets primarily the development of source-level multiprocessor memory aware optimizations. In contrast to previous system modeling approaches this approach tries to model the whole system and especially the memory hierarchy in a structural and semantically accessible way. 
Previous approaches primarily support generation of simulators or retargetable code selectors and thus concentrate on pure behavioral models or describe only the processor instruction set in a semantically accessible way. A simple, database-like interface is offered to the optimization developer, which in conjunction with the MACCv2 framework enables rapid development of source-level architecture-independent optimizations.}, }In this paper, we present a novel system modeling language which targets primarily the development of source-level multiprocessor memory aware optimizations. In contrast to previous system modeling approaches this approach tries to model the whole system and especially the memory hierarchy in a structural and semantically accessible way. Previous approaches primarily support generation of simulators or retargetable code selectors and thus concentrate on pure behavioral models or describe only the processor instruction set in a semantically accessible way. A simple, database-like interface is offered to the optimization developer, which in conjunction with the MACCv2 framework enables rapid development of source-level architecture-independent optimizations. Matthias Meier, Michael Engel, Matthias Steinkamp and Olaf Spinczyk.LavA: An Open Platform for Rapid Prototyping of MPSoCs. In Proceedings of the 20th International Conference on Field Programmable Logic and Applications (FPL '10), pages 452--457Milano, Italy, 2010[BibTeX]@inproceedings { meier:10:fpl, author = {Meier, Matthias and Engel, Michael and Steinkamp, Matthias and Spinczyk, Olaf}, title = {LavA: An Open Platform for Rapid Prototyping of MPSoCs}, booktitle = {Proceedings of the 20th International Conference on Field Programmable Logic and Applications (FPL '10)}, year = {2010}, pages = {452--457}, address = {Milano, Italy}, publisher = {IEEE Computer Society Press}, confidential = {n}, } Michael Engel, Felix Jungermann, Katharina Morik and Nico Piatkowski.Enhancing Ubiquitous Systems Through System Call Mining. In Proceedings of the ICDM 2010 Workshop on Large-scale Analytics for Complex Instrumented Systems (LACIS 2010) 2010[BibTeX]@inproceedings { engel:10:lacis, author = {Engel, Michael and Jungermann, Felix and Morik, Katharina and Piatkowski, Nico}, title = {Enhancing Ubiquitous Systems Through System Call Mining}, booktitle = {Proceedings of the ICDM 2010 Workshop on Large-scale Analytics for Complex Instrumented Systems (LACIS 2010)}, year = {2010}, confidential = {n}, } Peter Marwedel and Michael Engel.Plea for a Holistic Analysis of the Relationship between Information Technology and Carbon-Dioxide Emissions. In Workshop on Energy-aware Systems and Methods (GI-ITG)Hanover / Germany, February 2010[BibTeX][PDF][Abstract]@inproceedings { marwedel:10:GI, author = {Marwedel, Peter and Engel, Michael}, title = {Plea for a Holistic Analysis of the Relationship between Information Technology and Carbon-Dioxide Emissions}, booktitle = {Workshop on Energy-aware Systems and Methods (GI-ITG)}, year = {2010}, address = {Hanover / Germany}, month = {feb}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/arcs-10-marwedel.pdf}, confidential = {n}, abstract = {An analysis of the relationship between information technology (IT) and carbon-dioxide (CO2) emissions should not be constrained to an analysis of emissions caused during the operation of IT equipment.
Rather, an analysis of emissions should be based on a full life-cycle assessment (LCA) of IT systems, from their conception until their recycling. Also, the reduction of emissions through the use of IT systems should not be forgotten. This paper explains these viewpoints in more detail and provides rough life-cycle analyses of personal computers (PCs). It will be shown that, for standard scenarios, emissions from PC production are exceeding those of their shipment and use. This stresses the importance of using PCs as long as possible.}, }An analysis of the relationship between information technology (IT) and carbon-dioxide (CO2) emissions should not be constrained to an analysis of emissions caused during the operation of IT equipment. Rather, an analysis of emissions should be based on a full life-cycle assessment (LCA) of IT systems, from their conception until their recycling. Also, the reduction of emissions through the use of IT systems should not be forgotten. This paper explains these viewpoints in more detail and provides rough life-cycle analyses of personal computers (PCs). It will be shown that, for standard scenarios, emissions from PC production are exceeding those of their shipment and use. This stresses the importance of using PCs as long as possible. Constantin Timm, Andrej Gelenberg, Peter Marwedel and Frank Weichert.Energy Considerations within the Integration of General Purpose GPUs in Embedded Systems. In Proceedings of the International Conference on Advances in Distributed and Parallel Computing November 2010[BibTeX]@inproceedings { timm:2010:adpc, author = {Timm, Constantin and Gelenberg, Andrej and Marwedel, Peter and Weichert, Frank}, title = {Energy Considerations within the Integration of General Purpose GPUs in Embedded Systems}, booktitle = {Proceedings of the International Conference on Advances in Distributed and Parallel Computing}, year = {2010}, month = {November}, publisher = {Global Science \& Technology Forum}, confidential = {n}, } Daniel Cordes, Peter Marwedel and Arindam Mallik.Automatic Parallelization of Embedded Software Using Hierarchical Task Graphs and Integer Linear Programming. In Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis (CODES+ISSS 2010)Scottsdale / US, October 2010[BibTeX][PDF][Abstract]@inproceedings { cordes:10:CODES, author = {Cordes, Daniel and Marwedel, Peter and Mallik, Arindam}, title = {Automatic Parallelization of Embedded Software Using Hierarchical Task Graphs and Integer Linear Programming}, booktitle = {Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis (CODES+ISSS 2010)}, year = {2010}, address = {Scottsdale / US}, month = {oct}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-codes-cordes.pdf}, confidential = {n}, abstract = {The last years have shown that there is no way to disregard the advantages provided by multiprocessor System-on-Chip (MPSoC) architectures in the embedded systems domain. Using multiple cores in a single system makes it possible to close the gap between energy consumption, problems concerning heat dissipation, and computational power. Nevertheless, these benefits do not come for free. New challenges arise if existing applications have to be ported to these multiprocessor platforms. One of the most ambitious tasks is to extract efficient parallelism from these existing sequential applications.
Hence, many parallelization tools have been developed; most of them extract as much parallelism as possible, which is in general not the best choice for embedded systems with their limitations in hardware and software support. In contrast to previous approaches, we present a new automatic parallelization tool, tailored to the particular requirements of resource-constrained embedded systems. Therefore, this paper presents an algorithm which automatically steers the granularity of the generated tasks, with respect to architectural requirements and the overall execution time reduction. For this purpose, we exploit hierarchical task graphs to simplify a new integer linear programming based approach in order to split up sequential programs in an efficient way. Results on real-life benchmarks have shown that the presented approach is able to speed sequential applications up by a factor of up to 3.7 on a four-core MPSoC architecture.}, }The last years have shown that there is no way to disregard the advantages provided by multiprocessor System-on-Chip (MPSoC) architectures in the embedded systems domain. Using multiple cores in a single system makes it possible to close the gap between energy consumption, problems concerning heat dissipation, and computational power. Nevertheless, these benefits do not come for free. New challenges arise if existing applications have to be ported to these multiprocessor platforms. One of the most ambitious tasks is to extract efficient parallelism from these existing sequential applications. Hence, many parallelization tools have been developed; most of them extract as much parallelism as possible, which is in general not the best choice for embedded systems with their limitations in hardware and software support. In contrast to previous approaches, we present a new automatic parallelization tool, tailored to the particular requirements of resource-constrained embedded systems. Therefore, this paper presents an algorithm which automatically steers the granularity of the generated tasks, with respect to architectural requirements and the overall execution time reduction. For this purpose, we exploit hierarchical task graphs to simplify a new integer linear programming based approach in order to split up sequential programs in an efficient way. Results on real-life benchmarks have shown that the presented approach is able to speed sequential applications up by a factor of up to 3.7 on a four-core MPSoC architecture. Peter Marwedel and Michael Engel.Ein Plädoyer für eine holistische Analyse der Zusammenhänge zwischen Informationstechnologie und Kohlendioxyd-Emissionen. In VDE-KongressLeipzig, Germany, November 2010[BibTeX]@inproceedings { marwedel:10:VDE, author = {Marwedel, Peter and Engel, Michael}, title = {Ein Pl{\"a}doyer f{\"u}r eine holistische Analyse der Zusammenh{\"a}nge zwischen Informationstechnologie und Kohlendioxyd-Emissionen}, booktitle = {VDE-Kongress}, year = {2010}, address = {Leipzig, Germany}, month = {nov}, confidential = {n}, } Katharina Morik, Nico Piatkowski, Michael Engel and Felix Jungermann.Enhancing Ubiquitous Systems Through System Call Mining.
In Proceedings of the ICDM Workshop on Large-scale Analytics for Complex Instrumented Systems ({LACIS 2010})Sydney, Australia, December 2010[BibTeX]@inproceedings { morik:10:icdm10, author = {Morik, Katharina and Piatkowski, Nico and Engel, Michael and Jungermann, Felix}, title = {Enhancing Ubiquitous Systems Through System Call Mining}, booktitle = {Proceedings of the ICDM Workshop on Large-scale Analytics for Complex Instrumented Systems ({LACIS 2010})}, year = {2010}, address = {Sydney, Australia}, month = {dec}, publisher = {IEEE Computer Society Press}, confidential = {n}, } Sascha Plazar, Peter Marwedel and Jörg Rahnenführer.Optimizing Execution Runtimes of R Programs. In Book of Abstracts of International Symposium on Business and Industrial Statistics (ISBIS), pages 81-82Portoroz (Portorose) / Slovenia, July 2010[BibTeX][PDF]@inproceedings { plazar:10:isbis, author = {Plazar, Sascha and Marwedel, Peter and Rahnenf\ührer, J\örg}, title = {Optimizing Execution Runtimes of R Programs}, booktitle = {Book of Abstracts of International Symposium on Business and Industrial Statistics (ISBIS)}, year = {2010}, pages = {81-82}, address = {Portoroz (Portorose) / Slovenia}, month = {jul}, keywords = {rcs}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isbis.pdf}, confidential = {n}, } Sascha Plazar, Paul Lokuciejewski and Peter Marwedel.WCET-driven Cache-aware Memory Content Selection. In Proceedings of the 13th IEEE International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC), pages 107-114Carmona / Spain, May 2010[BibTeX][PDF][Abstract]@inproceedings { plazar:10:isorc, author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter}, title = {WCET-driven Cache-aware Memory Content Selection}, booktitle = {Proceedings of the 13th IEEE International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC)}, year = {2010}, pages = {107-114}, address = {Carmona / Spain}, month = {may}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isorc.pdf}, confidential = {n}, abstract = {Caches are widely used to bridge the increasingly growing gap between processor and memory performance. They store copies of frequently used parts of the slow main memory for faster access. Static analysis techniques allow the estimation of the worst-case cache behavior and enable the computation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify if hard real-time systems satisfy their timing constraints and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a new WCET-driven cache-aware memory content selection algorithm, which allocates functions whose WCET highly benefits from a cached execution to cached memory areas. Vice versa, rarely used functions which do not benefit from a cached execution are allocated to non-cached memory areas. As a result of this, unfavorable functions w.\,r.\,t. a program's WCET can not evict beneficial functions from the cache. This can lead to a reduced cache miss ratio and a decreased WCET. The effectiveness of our approach is demonstrated by results achieved on real-life benchmarks. In a case study, our greedy algorithm is able to reduce the benchmarks' WCET by up to 20\%.}, }Caches are widely used to bridge the increasingly growing gap between processor and memory performance. 
They store copies of frequently used parts of the slow main memory for faster access. Static analysis techniques allow the estimation of the worst-case cache behavior and enable the computation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify if hard real-time systems satisfy their timing constraints and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a new WCET-driven cache-aware memory content selection algorithm, which allocates functions whose WCET highly benefits from a cached execution to cached memory areas. Vice versa, rarely used functions which do not benefit from a cached execution are allocated to non-cached memory areas. As a result of this, unfavorable functions w.r.t. a program's WCET cannot evict beneficial functions from the cache. This can lead to a reduced cache miss ratio and a decreased WCET. The effectiveness of our approach is demonstrated by results achieved on real-life benchmarks. In a case study, our greedy algorithm is able to reduce the benchmarks' WCET by up to 20%. (A schematic sketch of this greedy selection follows the next entry.) Frank Weichert, Marcel Gaspar, Alexander Zybin, Evgeny Gurevich, Alexander Görtz, Constantin Timm, Heinrich Müller and Peter Marwedel.Plasmonen-unterstützte Mikroskopie zur Detektion von Viren. In Bildverarbeitung für die MedizinAachen / Germany, March 2010[BibTeX][Abstract]@inproceedings { weichert:10:bvm, author = {Weichert, Frank and Gaspar, Marcel and Zybin, Alexander and Gurevich, Evgeny and G\"ortz, Alexander and Timm, Constantin and M\"uller, Heinrich and Marwedel, Peter}, title = {Plasmonen-unterst\"utzte Mikroskopie zur Detektion von Viren}, booktitle = {Bildverarbeitung f\"ur die Medizin}, year = {2010}, address = {Aachen / Germany}, month = {March}, confidential = {n}, abstract = {In Anbetracht zunehmend epidemisch auftretender viraler Infektionen ist eine effiziente und ubiquit\"ar verf\"ugbare Methode zur sicheren Virusdetektion hoch relevant. Mit der Plasmonen-unterst\"utzten Mikroskopie steht hierzu eine neuartige Untersuchungsmethode bereit, die aber gro{\ss}e Anforderungen an die Bildverarbeitung zur Differenzierung der Viren innerhalb der Bilddaten stellt. In dieser Arbeit wird hierzu ein erster erfolgversprechender Ansatz vorgestellt. \"Uber bildbasierte Mustererkennung und Zeitreihenanalysen in Kombination mit Klassifikationsverfahren konnte sowohl die Differenzierung von Nanoobjekten als auch die Detektion von Virus-\"ahnlichen Partikeln nachgewiesen werden.}, }Given the increasingly epidemic occurrence of viral infections, an efficient and ubiquitously available method for reliable virus detection is highly relevant. Plasmon-assisted microscopy provides a novel examination method for this purpose, but it places great demands on the image processing needed to differentiate the viruses within the image data. In this work, a first promising approach to this problem is presented. Using image-based pattern recognition and time-series analyses in combination with classification methods, both the differentiation of nano-objects and the detection of virus-like particles could be demonstrated.
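The greedy selection scheme of the WCET-driven memory content selection entry above can be sketched as a benefit-per-byte ranking under a capacity budget. Function names, sizes, and gain values are invented; the actual optimization derives the benefit of each function from WCET analysis rather than from fixed numbers:

```python
# Schematic greedy memory content selection: place the functions whose
# WCET benefits most per byte of code into the cached region until it
# is full; everything else stays in the non-cached region. All numbers
# are invented for illustration.

def select_cached(functions, cached_capacity):
    """functions: list of (name, size_bytes, wcet_gain_if_cached)."""
    ranked = sorted(functions, key=lambda f: f[2] / f[1], reverse=True)
    cached, used = [], 0
    for name, size, gain in ranked:
        if gain > 0 and used + size <= cached_capacity:
            cached.append(name)
            used += size
    return cached

funcs = [("decode", 4096, 900), ("init", 2048, 10),
         ("filter", 1024, 400), ("log", 512, 0)]
print(select_cached(funcs, cached_capacity=5120))  # ['filter', 'decode']
```

Because a function moved out of the cached region can no longer evict beneficial functions, such a placement can lower both the cache miss ratio and the WCET, as the case study reports.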
Andreas Heinig, Michael Engel, Florian Schmoll and Peter Marwedel.Using Application Knowledge to Improve Embedded Systems Dependability. In Proceedings of the Workshop on Hot Topics in System Dependability (HotDep 2010)Vancouver, Canada, October 2010[BibTeX][PDF]@inproceedings { heinig:10:hotdep, author = {Heinig, Andreas and Engel, Michael and Schmoll, Florian and Marwedel, Peter}, title = {Using Application Knowledge to Improve Embedded Systems Dependability}, booktitle = {Proceedings of the Workshop on Hot Topics in System Dependability (HotDep 2010)}, year = {2010}, address = {Vancouver, Canada}, month = {oct}, publisher = {USENIX Association}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-heinig-hotdep.pdf}, confidential = {n}, } Jochen Strunk, Andreas Heinig, Toni Volkmer, Wolfgang Rehm and Heiko Schick.ACCFS - Virtual File System Support for Host Coupled Run-Time Reconfigurable FPGAs. In Advances in Parallel Computing, Volume 19, Parallel Computing: From Multicores and GPU's to Petascale, 2010, from Parallel Computing with FPGAs (ParaFPGA) held in conjunction with International Conference on Parallel Computing (ParCo 2009) 2010[BibTeX]@inproceedings { sjoc2010parafpga, author = {Strunk, Jochen and Heinig, Andreas and Volkmer, Toni and Rehm, Wolfgang and Schick, Heiko}, title = {ACCFS - Virtual File System Support for Host Coupled Run-Time Reconfigurable FPGAs}, booktitle = {Advances in Parallel Computing, Volume 19, Parallel Computing: From Multicores and GPU's to Petascale, 2010, from Parallel Computing with FPGAs (ParaFPGA) held in conjunction with International Conference on Parallel Computing (ParCo 2009)}, year = {2010}, publisher = {IOS Press}, confidential = {n}, } Timon Kelter.Superblock-basierte Quellcodeoptimierungen zur WCET-Reduktion. In Workshop ''Echtzeit 2010''Boppard / Germany, November 2010[BibTeX][PDF][Abstract]@inproceedings { kelter:2010:gi-ez, author = {Kelter, Timon}, title = {Superblock-basierte Quellcodeoptimierungen zur WCET-Reduktion}, booktitle = {Workshop ''Echtzeit 2010''}, year = {2010}, address = {Boppard / Germany}, month = {nov}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-gi-echtzeit.pdf}, confidential = {n}, abstract = {Das Konzept der \emph{Superbl{\"o}cke} wurde auf dem Gebiet der Compileroptimierungen in der Vergangenheit bereits erfolgreich zur Optimierung der \emph{ACET} (Average Case Execution Time) verwendet. Superbl{\"o}cke sind dabei spezielle Ketten von Basisbl{\"o}cken, die es erleichtern Optimierungen {\"u}ber Basisblockgrenzen anzuwenden und somit ein h{\"o}heres Optimierungspotential zu schaffen. In der vorliegenden Arbeit wurde dieses Konzept zum ersten Mal zur Reduktion der \emph{WCET} (Worst Case Execution Time) von Programmen f{\"u}r eingebettete Systeme ausgenutzt. Die WCET ist im Kontext der eingebetteten Systeme eine wichtige Metrik, da viele eingebettete Systeme unter Echtzeitbedingungen arbeiten m{\"u}ssen und hierzu eine sichere obere Schranke f{\"u}r die Laufzeit eines Programms unabdingbar ist. Die vorgestellte Superblockbildung baut auf einem neuartigen Trace-Selektions-Algorithmus auf, der WCET-Daten auswertet. Au{\ss}erdem wurde das Konzept der Superbl{\"o}cke zum ersten Mal auf der Quellcodeebene angewandt. Auf diese Weise findet die Optimierung fr{\"u}her statt, so da{\ss} eine gr{\"o}{\ss}ere Anzahl nachfolgender Optimierungen von der erzielten Umstrukturierung profitieren kann.
Weiterhin wurden die klassischen Optimierungen \emph{Common Subexpression Elimination (CSE)} und \emph{Dead Code Elimination (DCE)} an die Anwendung in Quellcode-Superbl{\"o}cken angepasst. Mit diesen Techniken wurde auf einer Testmenge von 55 bekannten Standard-Benchmarks eine durchschnittliche WCET-Reduktion von bis zu 10.2\% erzielt.}, }The concept of superblocks has been used successfully in the field of compiler optimization in the past to optimize the ACET (Average Case Execution Time). Superblocks are special chains of basic blocks that make it easier to apply optimizations across basic-block boundaries and thus create a higher optimization potential. In this work, this concept was exploited for the first time to reduce the WCET (Worst Case Execution Time) of programs for embedded systems. The WCET is an important metric in the context of embedded systems, since many embedded systems have to operate under real-time conditions, for which a safe upper bound on a program's execution time is indispensable. The presented superblock formation builds on a novel trace selection algorithm that evaluates WCET data. Furthermore, the concept of superblocks was applied at the source code level for the first time. In this way, the optimization takes place earlier, so that a larger number of subsequent optimizations can profit from the achieved restructuring. In addition, the classical optimizations Common Subexpression Elimination (CSE) and Dead Code Elimination (DCE) were adapted to their application in source-code superblocks. With these techniques, an average WCET reduction of up to 10.2% was achieved on a test set of 55 well-known standard benchmarks. Christos Baloukas, Lazaros Papadopoulos, Dimitrios Soudris, Sander Stuijk, Olivera Jovanovic, Florian Schmoll, Daniel Cordes, Robert Pyka, Arindam Mallik, Stylianos Mamagkakis, François Capman, Séverin Collet, Nikolaos Mitas and Dimitrios Kritharidis.Mapping Embedded Applications on MPSoCs: The MNEMEE Approach. In Proceedings of the 2010 IEEE Annual Symposium on VLSI, pages 512-517Washington, DC, USA, September 2010[BibTeX][PDF][Abstract]@inproceedings { baloukas:10:isvlsi, author = {Baloukas, Christos and Papadopoulos, Lazaros and Soudris, Dimitrios and Stuijk, Sander and Jovanovic, Olivera and Schmoll, Florian and Cordes, Daniel and Pyka, Robert and Mallik, Arindam and Mamagkakis, Stylianos and Capman, Fran\c{c}ois and Collet, S\'{e}verin and Mitas, Nikolaos and Kritharidis, Dimitrios}, title = {Mapping Embedded Applications on MPSoCs: The MNEMEE Approach}, booktitle = {Proceedings of the 2010 IEEE Annual Symposium on VLSI}, year = {2010}, series = {ISVLSI '10}, pages = {512-517}, address = {Washington, DC, USA}, month = {sep}, publisher = {IEEE Computer Society}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isvlsi.pdf}, confidential = {n}, abstract = {As embedded systems are becoming the center of our digital life, system design becomes progressively harder. The integration of multiple features on devices with limited resources requires careful and exhaustive exploration of the design search space in order to efficiently map modern applications to an embedded multi-processor platform.
The MNEMEE project addresses this challenge by offering a unique integrated tool flow that performs source-to-source transformations to automatically optimize the original source code and map it on the target platform. The optimizations aim at reducing the number of memory accesses and the required memory storage of both dynamically and statically allocated data. Furthermore, the MNEMEE tool flow performs optimal assignment of all data on the memory hierarchy of the target platform. Designers can use the whole flow or a part of it and integrate it into their own design flow. This paper gives an overview of the MNEMEE tool flow. It also presents two industrial case studies that demonstrate how the techniques and tools developed in the MNEMEE project can be integrated into industrial design flows.}, }As embedded systems are becoming the center of our digital life, system design becomes progressively harder. The integration of multiple features on devices with limited resources requires careful and exhaustive exploration of the design search space in order to efficiently map modern applications to an embedded multi-processor platform. The MNEMEE project addresses this challenge by offering a unique integrated tool flow that performs source-to-source transformations to automatically optimize the original source code and map it on the target platform. The optimizations aim at reducing the number of memory accesses and the required memory storage of both dynamically and statically allocated data. Furthermore, the MNEMEE tool flow performs optimal assignment of all data on the memory hierarchy of the target platform. Designers can use the whole flow or a part of it and integrate it into their own design flow. This paper gives an overview of the MNEMEE tool flow. It also presents two industrial case studies that demonstrate how the techniques and tools developed in the MNEMEE project can be integrated into industrial design flows. Andreas Heinig, Michael Engel, Florian Schmoll and Peter Marwedel.Improving Transient Memory Fault Resilience of an H.264 Decoder. In Proceedings of the Workshop on Embedded Systems for Real-time Multimedia (ESTIMedia 2010)Scottsdale, AZ, USA, October 2010[BibTeX][PDF]@inproceedings { heinig:10:estimedia, author = {Heinig, Andreas and Engel, Michael and Schmoll, Florian and Marwedel, Peter}, title = {Improving Transient Memory Fault Resilience of an H.264 Decoder}, booktitle = {Proceedings of the Workshop on Embedded Systems for Real-time Multimedia (ESTIMedia 2010)}, year = {2010}, address = {Scottsdale, AZ, USA}, month = {oct}, publisher = {IEEE Computer Society Press}, keywords = {ders}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-heinig-estimedia.pdf}, confidential = {n}, } Paul Lokuciejewski, Timon Kelter and Peter Marwedel.Superblock-Based Source Code Optimizations for WCET Reduction.
In Proceedings of the 7th International Conference on Embedded Software and Systems (ICESS), pages 1918-1925Bradford / UK, June 2010[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:10:icess, author = {Lokuciejewski, Paul and Kelter, Timon and Marwedel, Peter}, title = {Superblock-Based Source Code Optimizations for WCET Reduction}, booktitle = {Proceedings of the 7th International Conference on Embedded Software and Systems (ICESS)}, year = {2010}, pages = {1918-1925}, address = {Bradford / UK}, month = {jun}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-icess.pdf}, confidential = {n}, abstract = {Superblocks represent regions in a program code that consist of multiple basic blocks. Compilers benefit from this structure since it enables optimization across block boundaries. This increased optimization potential was thoroughly studied in the past for average-case execution time (ACET) reduction at assembly level. In this paper, the concept of superblocks is exploited for the optimization of embedded real-time systems that have to meet stringent timing constraints specified by the worst-case execution time (WCET). To achieve this goal, our superblock formation is based on a novel trace selection algorithm which is driven by WCET data. Moreover, we translate superblocks for the first time from assembly to source code level. This approach enables an early code restructuring in the optimizer, providing more optimization opportunities for both subsequent source code and assembly level transformations. An adaption of the traditional optimizations common subexpression and dead code elimination to our WCET-aware superblocks allows an effective WCET reduction. Using our techniques, we significantly outperform standard optimizations and achieve an average WCET reduction of up to 10.2\% for a total of 55 real-life benchmarks.}, }Superblocks represent regions in a program code that consist of multiple basic blocks. Compilers benefit from this structure since it enables optimization across block boundaries. This increased optimization potential was thoroughly studied in the past for average-case execution time (ACET) reduction at assembly level. In this paper, the concept of superblocks is exploited for the optimization of embedded real-time systems that have to meet stringent timing constraints specified by the worst-case execution time (WCET). To achieve this goal, our superblock formation is based on a novel trace selection algorithm which is driven by WCET data. Moreover, we translate superblocks for the first time from assembly to source code level. This approach enables an early code restructuring in the optimizer, providing more optimization opportunities for both subsequent source code and assembly level transformations. An adaption of the traditional optimizations common subexpression and dead code elimination to our WCET-aware superblocks allows an effective WCET reduction. Using our techniques, we significantly outperform standard optimizations and achieve an average WCET reduction of up to 10.2% for a total of 55 real-life benchmarks. Paul Lokuciejewski, Sascha Plazar, Heiko Falk, Peter Marwedel and Lothar Thiele.Multi-Objective Exploration of Compiler Optimizations for Real-Time Systems. 
In Proceedings of the 13th International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC), pages 115-122Carmona / Spain, May 2010[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:10:isorc, author = {Lokuciejewski, Paul and Plazar, Sascha and Falk, Heiko and Marwedel, Peter and Thiele, Lothar}, title = {Multi-Objective Exploration of Compiler Optimizations for Real-Time Systems}, booktitle = {Proceedings of the 13th International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC)}, year = {2010}, pages = {115-122}, address = {Carmona / Spain}, month = {may}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isorc_2.pdf}, confidential = {n}, abstract = {With the growing complexity of embedded systems software, high code quality can only be achieved using a compiler. Sophisticated compilers provide a vast spectrum of various optimizations to improve code aggressively w.r.t. different objective functions, e.g., average-case execution time \textit{(ACET)} or code size. Due to the complex interactions between the optimizations, the choice for a promising sequence of code transformations is not trivial. Compiler developers address this problem by proposing standard optimization levels, e.g., \textit{O3} or \textit{Os}. However, previous studies have shown that these standard levels often miss optimization potential or might even result in performance degradation. In this paper, we propose the first adaptive WCET-aware compiler framework for an automatic search of compiler optimization sequences which yield highly optimized code. Besides the objective functions ACET and code size, we consider the worst-case execution time \textit{(WCET)} which is a crucial parameter for real-time systems. To find suitable trade-offs between these objectives, stochastic evolutionary multi-objective algorithms identifying Pareto optimal solutions are exploited. A comparison based on statistical performance assessments is performed which helps to determine the most suitable multi-objective optimizer. The effectiveness of our approach is demonstrated on real-life benchmarks showing that standard optimization levels can be significantly outperformed.}, }With the growing complexity of embedded systems software, high code quality can only be achieved using a compiler. Sophisticated compilers provide a vast spectrum of various optimizations to improve code aggressively w.r.t. different objective functions, e.g., average-case execution time (ACET) or code size. Due to the complex interactions between the optimizations, the choice for a promising sequence of code transformations is not trivial. Compiler developers address this problem by proposing standard optimization levels, e.g., O3 or Os. However, previous studies have shown that these standard levels often miss optimization potential or might even result in performance degradation. In this paper, we propose the first adaptive WCET-aware compiler framework for an automatic search of compiler optimization sequences which yield highly optimized code. Besides the objective functions ACET and code size, we consider the worst-case execution time (WCET) which is a crucial parameter for real-time systems. To find suitable trade-offs between these objectives, stochastic evolutionary multi-objective algorithms identifying Pareto optimal solutions are exploited. 
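As an illustration of the Pareto-optimality notion used in this abstract, the following minimal C sketch filters a set of candidate optimization sequences down to the non-dominated ones. The objective triples (WCET, ACET, code size) are invented and the sketch is not part of the cited work:

    #include <stdbool.h>
    #include <stdio.h>

    /* One candidate optimization sequence, scored by three objectives;
     * smaller is better for all of them. All values are invented. */
    typedef struct { const char *name; double wcet, acet; int size; } Cand;

    /* a dominates b: no worse in every objective, strictly better in one. */
    static bool dominates(const Cand *a, const Cand *b) {
        bool no_worse = a->wcet <= b->wcet && a->acet <= b->acet && a->size <= b->size;
        bool better   = a->wcet <  b->wcet || a->acet <  b->acet || a->size <  b->size;
        return no_worse && better;
    }

    int main(void) {
        Cand c[] = { {"O3",   950.0, 400.0, 12000}, {"Os",   1100.0, 450.0,  9000},
                     {"seqA", 900.0, 420.0, 11000}, {"seqB", 1000.0, 500.0, 13000} };
        int n = (int)(sizeof c / sizeof c[0]);
        for (int i = 0; i < n; i++) {   /* keep only the non-dominated candidates */
            bool dominated = false;
            for (int j = 0; j < n; j++)
                if (j != i && dominates(&c[j], &c[i])) dominated = true;
            if (!dominated) printf("Pareto-optimal: %s\n", c[i].name);
        }
        return 0;
    }

For the invented values above, O3, Os and seqA survive as Pareto-optimal, while seqB is dominated by O3 in all three objectives.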
A comparison based on statistical performance assessments is performed which helps to determine the most suitable multi-objective optimizer. The effectiveness of our approach is demonstrated on real-life benchmarks showing that standard optimization levels can be significantly outperformed. Paul Lokuciejewski, Marco Stolpe, Katharina Morik and Peter Marwedel.Automatic Selection of Machine Learning Models for WCET-aware Compiler Heuristic Generation. In Proceedings of the 4th Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART), pages 3-17Pisa / Italy, January 2010[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:10:smart, author = {Lokuciejewski, Paul and Stolpe, Marco and Morik, Katharina and Marwedel, Peter}, title = {Automatic Selection of Machine Learning Models for WCET-aware Compiler Heuristic Generation}, booktitle = {Proceedings of the 4th Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART)}, year = {2010}, pages = {3-17}, address = {Pisa / Italy}, month = {jan}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-smart.pdf}, confidential = {n}, abstract = {Machine learning has shown its capabilities for an automatic generation of heuristics used by optimizing compilers. The advantages of these heuristics are that they can be easily adapted to a new environment and in some cases outperform hand-crafted compiler optimizations. However, this approach shifts the effort from manual heuristic tuning to the model selection problem of machine learning - i.e., selecting learning algorithms and their respective parameters - which is a tedious task in its own right. In this paper, we tackle the model selection problem in a systematic way. As our experiments show, the right choice of a learning algorithm and its parameters can significantly affect the quality of the generated heuristics. We present a generic framework integrating machine learning into a compiler to enable an automatic search for the best learning algorithm. To find good settings for the learner parameters within the large search space, optimizations based on evolutionary algorithms are applied. In contrast to the majority of other approaches aiming at a reduction of the average-case execution time (ACET), our goal is the minimization of the worst-case execution time (WCET) which is a key parameter for embedded systems acting as real-time systems. A careful case study on the heuristic generation for the well-known optimization loop invariant code motion shows the challenges and benefits of our methods.}, }Machine learning has shown its capabilities for an automatic generation of heuristics used by optimizing compilers. The advantages of these heuristics are that they can be easily adapted to a new environment and in some cases outperform hand-crafted compiler optimizations. However, this approach shifts the effort from manual heuristic tuning to the model selection problem of machine learning - i.e., selecting learning algorithms and their respective parameters - which is a tedious task in its own right. In this paper, we tackle the model selection problem in a systematic way. As our experiments show, the right choice of a learning algorithm and its parameters can significantly affect the quality of the generated heuristics. We present a generic framework integrating machine learning into a compiler to enable an automatic search for the best learning algorithm.
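The selection loop at the core of such automatic model selection can be pictured as follows. This C sketch is purely illustrative: the score functions are invented stand-ins for trained models, and a plain grid search stands in for the evolutionary parameter search used in the paper:

    #include <stdio.h>

    /* Invented stand-ins for learners: each maps a parameter value to a
     * cross-validation score (higher is better). */
    static double score_decision_tree(double p) { return 0.70 + 0.02 * p; }
    static double score_random_forest(double p) { return 0.78 + 0.01 * p; }
    static double score_svm(double p)           { return 0.76 - 0.01 * p; }

    typedef struct { const char *name; double (*score)(double); } Learner;

    int main(void) {
        Learner ls[] = { {"decision tree", score_decision_tree},
                         {"random forest", score_random_forest},
                         {"SVM",           score_svm} };
        const char *best = NULL;
        double best_score = -1.0;
        /* Evaluate every learner over a toy parameter grid and keep the
         * overall winner; this is the essence of automatic model selection. */
        for (int i = 0; i < 3; i++)
            for (double p = 0.0; p <= 4.0; p += 1.0) {
                double s = ls[i].score(p);
                if (s > best_score) { best_score = s; best = ls[i].name; }
            }
        printf("selected model: %s (score %.2f)\n", best, best_score);
        return 0;
    }

With the invented scores above, the loop settles on the random forest model.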
To find good settings for the learner parameters within the large search space, optimizations based on evolutionary algorithms are applied. In contrast to the majority of other approaches aiming at a reduction of the average-case execution time (ACET), our goal is the minimization of the worst-case execution time (WCET) which is a key parameter for embedded systems acting as real-time systems. A careful case study on the heuristic generation for the well-known optimization loop invariant code motion shows the challenges and benefits of our methods. Michael Engel, Hans P. Reiser, Olaf Spinczyk, Rüdiger Kapitza and Jörg Nolte.Proceedings of the Workshop on Isolation and Integration for Dependable Systems (IIDS 2010).Paris, France, April 2010[BibTeX]@inproceedings { engel:10:eurosys-iids-proc, author = {Engel, Michael and Reiser, Hans P. and Spinczyk, Olaf and Kapitza, R{\"u}diger and Nolte, J{\"o}rg}, title = {Proceedings of the Workshop on Isolation and Integration for Dependable Systems (IIDS 2010)}, year = {2010}, address = {Paris, France}, month = {apr}, publisher = {ACM Press}, confidential = {n}, } Constantin Timm, Jens Schmutzler, Peter Marwedel and Christian Wietfeld.Dynamic Web Service Orchestration applied to the Device Profile for Web Services in Hierarchical Networks. In COMSWARE '09: Proceedings of the Fourth International ICST Conference on COMmunication System softWAre and middlewaRE, pages 1 - 6Dublin, Ireland, 06 2009[BibTeX][Abstract]@inproceedings { 2009Timm, author = {Timm, Constantin and Schmutzler, Jens and Marwedel, Peter and Wietfeld, Christian}, title = {Dynamic Web Service Orchestration applied to the Device Profile for Web Services in Hierarchical Networks}, booktitle = {COMSWARE '09: Proceedings of the Fourth International ICST Conference on COMmunication System softWAre and middlewaRE}, year = {2009}, pages = {1 - 6}, address = {Dublin, Ireland}, month = {06}, confidential = {n}, abstract = {Based on the idea of Service Oriented Architectures (SOA), Web Services paved the way for open and flexible interaction between heterogeneous systems with a loose coupling between service endpoints. The Device Profile for Web Services (DPWS) implements a subset of WS-* specifications in order to make the advantages of the Web Service architecture available to a growing embedded systems market. In this paper we are proposing a service orchestration mechanism applied to services on top of a DPWS-based middleware. The approach is complementary to the rather complex and resource intensive Web Service Business Process Execution Language (WS-BPEL) and focuses on service orchestration on resource constrained devices deployed in hierarchical network topologies. We validate our service orchestration concept through its resource consumption and illustrate its seamless integration into the service development cycle based on the underlying DPWS-compliant middleware.}, }Based on the idea of Service Oriented Architectures (SOA), Web Services paved the way for open and flexible interaction between heterogeneous systems with a loose coupling between service endpoints. The Device Profile for Web Services (DPWS) implements a subset of WS-* specifications in order to make the advantages of the Web Service architecture available to a growing embedded systems market. In this paper we are proposing a service orchestration mechanism applied to services on top of a DPWS-based middleware.
The approach is complementary to the rather complex and resource intensive Web Service Business Process Execution Language (WS-BPEL) and focuses on service orchestration on resource constrained devices deployed in hierarchical network topologies. We validate our service orchestration concept through its resource consumption and illustrate its seamless integration into the service development cycle based on the underlying DPWS-compliant middleware. Heiko Falk.WCET-aware Register Allocation based on Graph Coloring. In The 46th Design Automation Conference (DAC), pages 726-731San Francisco / USA, July 2009[BibTeX][PDF][Abstract]@inproceedings { falk:09:dac1, author = {Falk, Heiko}, title = {WCET-aware Register Allocation based on Graph Coloring}, booktitle = {The 46th Design Automation Conference (DAC)}, year = {2009}, pages = {726-731}, address = {San Francisco / USA}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-dac_1.pdf}, confidential = {n}, abstract = {Current compilers lack precise timing models guiding their built-in optimizations. Hence, compilers apply ad-hoc heuristics during optimization to improve code quality. One of the most important optimizations is register allocation. Many compilers heuristically decide when and where to spill a register to memory, without having a clear understanding of the impact of such spill code on a program's run time. This paper extends a graph coloring register allocator such that it uses precise worst-case execution time \textit{(WCET)} models. Using this WCET timing data, the compiler tries to avoid spill code generation along the critical path defining a program's WCET. To the best of our knowledge, this paper is the first one to present a WCET-aware register allocator. Our results underline the effectiveness of the proposed techniques. For a total of 46 realistic benchmarks, we reduced WCETs by 31.2\% on average. Additionally, the runtimes of our WCET-aware register allocator still remain acceptable.}, }Current compilers lack precise timing models guiding their built-in optimizations. Hence, compilers apply ad-hoc heuristics during optimization to improve code quality. One of the most important optimizations is register allocation. Many compilers heuristically decide when and where to spill a register to memory, without having a clear understanding of the impact of such spill code on a program's run time. This paper extends a graph coloring register allocator such that it uses precise worst-case execution time (WCET) models. Using this WCET timing data, the compiler tries to avoid spill code generation along the critical path defining a program's WCET. To the best of our knowledge, this paper is the first one to present a WCET-aware register allocator. Our results underline the effectiveness of the proposed techniques. For a total of 46 realistic benchmarks, we reduced WCETs by 31.2% on average. Additionally, the runtimes of our WCET-aware register allocator still remain acceptable. Heiko Falk and Jan C. Kleinsorge.Optimal Static WCET-aware Scratchpad Allocation of Program Code.
In The 46th Design Automation Conference (DAC), pages 732-737San Francisco / USA, July 2009[BibTeX][PDF][Abstract]@inproceedings { falk:09:dac2, author = {Falk, Heiko and Kleinsorge, Jan C.}, title = {Optimal Static WCET-aware Scratchpad Allocation of Program Code}, booktitle = {The 46th Design Automation Conference (DAC)}, year = {2009}, pages = {732-737}, address = {San Francisco / USA}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-dac_2.pdf}, confidential = {n}, abstract = {Caches are notorious for their unpredictability. It is difficult or even impossible to predict if a memory access will result in a definite cache hit or miss. This unpredictability is highly undesired especially when designing real-time systems where the \textit{worst-case execution time (WCET)} is one of the key metrics. \textit{Scratchpad memories (SPMs)} have proven to be a fully predictable alternative to caches. In contrast to caches, however, SPMs require dedicated compiler support. This paper presents an optimal static SPM allocation algorithm for program code. It minimizes WCETs by placing the most beneficial parts of a program's code in an SPM. Our results underline the effectiveness of the proposed techniques. For a total of 73 realistic benchmarks, we reduced WCETs by 7.4\% on average and by up to 40\%. Additionally, the run times of our ILP-based SPM allocator are negligible.}, }Caches are notorious for their unpredictability. It is difficult or even impossible to predict if a memory access will result in a definite cache hit or miss. This unpredictability is highly undesired especially when designing real-time systems where the worst-case execution time (WCET) is one of the key metrics. Scratchpad memories (SPMs) have proven to be a fully predictable alternative to caches. In contrast to caches, however, SPMs require dedicated compiler support. This paper presents an optimal static SPM allocation algorithm for program code. It minimizes WCETs by placing the most beneficial parts of a program's code in an SPM. Our results underline the effectiveness of the proposed techniques. For a total of 73 realistic benchmarks, we reduced WCETs by 7.4% on average and by up to 40%. Additionally, the run times of our ILP-based SPM allocator are negligible. Andreas Heinig, Jochen Strunk, Wolfgang Rehm and Heiko Schick.ACCFS - Operating System Integration of Computational Accelerators Using a VFS Approach. In Proceedings of Applied Reconfigurable Computing (ARC) 2009[BibTeX]@inproceedings { Heinig2009arc, author = {Heinig, Andreas and Strunk, Jochen and Rehm, Wolfgang and Schick, Heiko}, title = {ACCFS - Operating System Integration of Computational Accelerators Using a VFS Approach}, booktitle = {Proceedings of Applied Reconfigurable Computing (ARC)}, year = {2009}, publisher = {LNCS}, confidential = {n}, } Jochen Strunk, Andreas Heinig, Toni Volkmer, Wolfgang Rehm and Heiko Schick.Run-Time Reconfiguration for HyperTransport coupled FPGAs using ACCFS.
In Proceedings of First International Workshop on HyperTransport Research and Applications 2009[BibTeX]@inproceedings { sjoc2009whtra, author = {Strunk, Jochen and Heinig, Andreas and Volkmer, Toni and Rehm, Wolfgang and Schick, Heiko}, title = {Run-Time Reconfiguration for HyperTransport coupled FPGAs using ACCFS}, booktitle = {Proceedings of First International Workshop on HyperTransport Research and Applications}, year = {2009}, publisher = {WHTRA}, confidential = {n}, } Michael Engel and Olaf Spinczyk.A Radical Approach to Network-on-Chip Operating Systems. In Proceedings of the 42nd Hawai'i International Conference on System Sciences (HICSS '09)Waikoloa, Big Island, Hawaii, January 2009[BibTeX]@inproceedings { engel:09:hicss, author = {Engel, Michael and Spinczyk, Olaf}, title = {A Radical Approach to Network-on-Chip Operating Systems}, booktitle = {Proceedings of the 42nd Hawai'i International Conference on System Sciences (HICSS '09)}, year = {2009}, address = {Waikoloa, Big Island, Hawaii}, month = {jan}, publisher = {IEEE Computer Society Press}, confidential = {n}, } Sascha Plazar, Paul Lokuciejewski and Peter Marwedel.WCET-aware Software Based Cache Partitioning for Multi-Task Real-Time Systems. In The 9th International Workshop on Worst-Case Execution Time Analysis (WCET), pages 78-88Dublin / Ireland, June 2009[BibTeX][PDF][Abstract]@inproceedings { plazar:09:wcet, author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter}, title = {WCET-aware Software Based Cache Partitioning for Multi-Task Real-Time Systems}, booktitle = {The 9th International Workshop on Worst-Case Execution Time Analysis (WCET)}, year = {2009}, pages = {78-88}, address = {Dublin / Ireland}, month = {jun}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-wcet.pdf}, confidential = {n}, abstract = {Caches are a source of unpredictability since it is very difficult to predict if a memory access results in a cache hit or miss. In systems running multiple tasks steered by a preempting scheduler, it is even impossible to determine the cache behavior since interrupt-driven schedulers lead to unknown points of time for context switches. Partitioned caches are already used in multi-task environments to increase the cache hit ratio by avoiding mutual eviction of tasks from the cache. For real-time systems, the upper bound of the execution time is one of the most important metrics, called the Worst-Case Execution Time (WCET). In this paper, we use partitioning of instruction caches as a technique to achieve tighter WCET estimations since tasks can not be evicted from their partition by other tasks. We propose a novel WCET-aware algorithm, which determines the optimal partition size for each task with focus on decreasing the system's WCET for a given set of possible partition sizes. Employing this algorithm, we are able to decrease the WCET depending on the number of tasks in a set by up to 34\%. On average, reductions between 12\% and 19\% can be achieved.}, }Caches are a source of unpredictability since it is very difficult to predict if a memory access results in a cache hit or miss. In systems running multiple tasks steered by a preempting scheduler, it is even impossible to determine the cache behavior since interrupt-driven schedulers lead to unknown points of time for context switches. Partitioned caches are already used in multi-task environments to increase the cache hit ratio by avoiding mutual eviction of tasks from the cache. 
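The underlying selection problem, choosing one partition size per task from a candidate set so that the sizes fit into the cache and the summed WCETs are minimal, can be sketched in a few lines of C. The instance below is a toy with invented WCET values, and plain exhaustive search stands in for the paper's algorithm:

    #include <stdio.h>

    #define NSIZES 3

    /* Candidate partition sizes in cache ways, and wcet[t][s]: invented
     * WCET of task t when it owns a partition of sizes[s] ways. */
    static const int sizes[NSIZES] = { 1, 2, 4 };
    static const int wcet[3][NSIZES] = {
        { 900, 700, 650 },
        { 500, 420, 400 },
        { 800, 760, 610 },
    };

    int main(void) {
        const int cache_ways = 8;
        int best[3], best_sum = -1;
        /* Try all NSIZES^3 assignments; keep the feasible one with the
         * smallest total WCET. */
        for (int a = 0; a < NSIZES; a++)
          for (int b = 0; b < NSIZES; b++)
            for (int c = 0; c < NSIZES; c++) {
                int ways = sizes[a] + sizes[b] + sizes[c];
                int sum  = wcet[0][a] + wcet[1][b] + wcet[2][c];
                if (ways <= cache_ways && (best_sum < 0 || sum < best_sum)) {
                    best_sum = sum;
                    best[0] = a; best[1] = b; best[2] = c;
                }
            }
        for (int t = 0; t < 3; t++)
            printf("task %d: %d way(s)\n", t, sizes[best[t]]);
        printf("total WCET: %d\n", best_sum);
        return 0;
    }

For real task sets the number of combinations grows exponentially with the number of tasks, so an enumeration like this is only viable for toy instances.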
For real-time systems, the upper bound of the execution time is one of the most important metrics, called the Worst-Case Execution Time (WCET). In this paper, we use partitioning of instruction caches as a technique to achieve tighter WCET estimations since tasks can not be evicted from their partition by other tasks. We propose a novel WCET-aware algorithm, which determines the optimal partition size for each task with focus on decreasing the system's WCET for a given set of possible partition sizes. Employing this algorithm, we are able to decrease the WCET depending on the number of tasks in a set by up to 34%. On average, reductions between 12% and 19% can be achieved. Daniel Dressler, Martin Groß, Jan-Philipp Kappmeier, Timon Kelter, Daniel Plümpe, Melanie Schmidt, Martin Skutella and Sylvie Temme.On the Use of Network Flow Techniques for Assigning Evacuees to Exits. In The First International Conference on Evacuation Modeling (ICEM)Delft / The Netherlands, September 2009[BibTeX][PDF][Abstract]@inproceedings { dressler:09:icem, author = {Dressler, Daniel and Gro\ss, Martin and Kappmeier, Jan-Philipp and Kelter, Timon and Pl\"umpe, Daniel and Schmidt, Melanie and Skutella, Martin and Temme, Sylvie}, title = {On the Use of Network Flow Techniques for Assigning Evacuees to Exits}, booktitle = {The First International Conference on Evacuation Modeling (ICEM)}, year = {2009}, address = {Delft / The Netherlands}, month = {sep}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-icem.pdf}, confidential = {n}, abstract = {We apply network flow techniques to find good exit selections for evacuees in an emergency evacuation. More precisely, we present two algorithms for computing exit distributions using both classical flows and flows over time which are well known from combinatorial optimization. The performance of these new proposals is compared to a simple shortest path approach and to a best response dynamics approach by using a cellular automaton model.}, }We apply network flow techniques to find good exit selections for evacuees in an emergency evacuation. More precisely, we present two algorithms for computing exit distributions using both classical flows and flows over time which are well known from combinatorial optimization. The performance of these new proposals is compared to a simple shortest path approach and to a best response dynamics approach by using a cellular automaton model. G. Schuenemann, P. Hartmann, D. Schirmer, P. Towalski, T. Weis, K. Wille and P. Marwedel.An FPGA Based Data Acquisition System for a fast Orbit Feedback at DELTA. In 9th European Workshop on Beam Diagnostics and Instrumentation for Particle AcceleratorsBasel / Switzerland, May 2009[BibTeX][PDF][Abstract]@inproceedings { marwedel:09:dipac, author = {Schuenemann, G. and Hartmann, P. and Schirmer, D. and Towalski, P. and Weis, T. and Wille, K. and Marwedel, P.}, title = {An FPGA Based Data Acquisition System for a fast Orbit Feedback at DELTA}, booktitle = {9th European Workshop on Beam Diagnostics and Instrumentation for Particle Accelerators}, year = {2009}, address = {Basel / Switzerland}, month = {may}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-dipac.pdf}, confidential = {n}, abstract = {The demand for beam orbit stability for frequencies up to 1kHz resulted in the need for a fast orbit position data acquisition system at DELTA. 
The measurement frequency was decided to be 10kHz which results in a good margin for 1kHz corrections. It is based on a Xilinx University Program Virtex-II Pro Development System in conjunction with an in-house developed Analog-Digital Converter board, featuring two Analog Devices AD974 chips. In-house developed software written in VHDL manages measurement and data pre-processing. A communication controller has been adopted from the Diamond Light Source and is used as communication instance. The communication controller is versatile in its application. The data distribution between two or more of the developed measuring systems is possible. This includes data distribution with other systems utilizing the communication controller, e.g. the Libera beam diagnostic system. To enhance its measuring capabilities one of the two onboard PowerPC cores is running a Linux kernel. A kernel module, capable of receiving the measurement data from the Field Programmable Gate Array (FPGA) measurement core, was implemented, allowing for advanced data processing and distribution options. The paper presents the design of the system, the used methods and successful results of the first beam measurements.}, }The demand for beam orbit stability for frequencies up to 1kHz resulted in the need for a fast orbit position data acquisition system at DELTA. The measurement frequency was decided to be 10kHz which results in a good margin for 1kHz corrections. It is based on a Xilinx University Program Virtex-II Pro Development System in conjunction with an in-house developed Analog-Digital Converter board, featuring two Analog Devices AD974 chips. In-house developed software written in VHDL manages measurement and data pre-processing. A communication controller has been adopted from the Diamond Light Source and is used as communication instance. The communication controller is versatile in its application. The data distribution between two or more of the developed measuring systems is possible. This includes data distribution with other systems utilizing the communication controller, e.g. the Libera beam diagnostic system. To enhance its measuring capabilities one of the two onboard PowerPC cores is running a Linux kernel. A kernel module, capable of receiving the measurement data from the Field Programmable Gate Array (FPGA) measurement core, was implemented, allowing for advanced data processing and distribution options. The paper presents the design of the system, the used methods and successful results of the first beam measurements. Paul Lokuciejewski, Daniel Cordes, Heiko Falk and Peter Marwedel.A Fast and Precise Static Loop Analysis based on Abstract Interpretation, Program Slicing and Polytope Models. In International Symposium on Code Generation and Optimization (CGO), pages 136-146Seattle / USA, March 2009[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:cgo, author = {Lokuciejewski, Paul and Cordes, Daniel and Falk, Heiko and Marwedel, Peter}, title = {A Fast and Precise Static Loop Analysis based on Abstract Interpretation, Program Slicing and Polytope Models}, booktitle = {International Symposium on Code Generation and Optimization (CGO)}, year = {2009}, pages = {136-146}, address = {Seattle / USA}, month = {mar}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-cgo.pdf}, confidential = {n}, abstract = {A static loop analysis is a program analysis computing loop iteration counts.
This information is crucial for different fields of applications. In the domain of compilers, the knowledge about loop iterations can be exploited for aggressive loop optimizations like Loop Unrolling. A loop analyzer also provides static information about code execution frequencies which can assist feedback-directed optimizations. Another prominent application is the static worst-case execution time (WCET) analysis which relies on a safe approximation of loop iteration counts. In this paper, we propose a framework for a static loop analysis based on Abstract Interpretation, a theory of a sound approximation of program semantics. To accelerate the analysis, we preprocess the analyzed code using Program Slicing, a technique that removes statements irrelevant for the loop analysis. In addition, we introduce a novel polytope-based loop evaluation that further significantly reduces the analysis time. The efficiency of our loop analyzer is evaluated on a large number of benchmarks. Results show that 99\% of the considered loops could be successfully analyzed in an acceptable amount of time. This study points out that our methodology is best suited for real-world problems.}, }A static loop analysis is a program analysis computing loop iteration counts. This information is crucial for different fields of applications. In the domain of compilers, the knowledge about loop iterations can be exploited for aggressive loop optimizations like Loop Unrolling. A loop analyzer also provides static information about code execution frequencies which can assist feedback-directed optimizations. Another prominent application is the static worst-case execution time (WCET) analysis which relies on a safe approximation of loop iteration counts. In this paper, we propose a framework for a static loop analysis based on Abstract Interpretation, a theory of a sound approximation of program semantics. To accelerate the analysis, we preprocess the analyzed code using Program Slicing, a technique that removes statements irrelevant for the loop analysis. In addition, we introduce a novel polytope-based loop evaluation that further significantly reduces the analysis time. The efficiency of our loop analyzer is evaluated on a large number of benchmarks. Results show that 99% of the considered loops could be successfully analyzed in an acceptable amount of time. This study points out that our methodology is best suited for real-world problems. Paul Lokuciejewski and Peter Marwedel.Combining Worst-Case Timing Models, Loop Unrolling, and Static Loop Analysis for WCET Minimization. In The 21st Euromicro Conference on Real-Time Systems (ECRTS), pages 35-44Dublin / Ireland, July 2009[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:ecrts, author = {Lokuciejewski, Paul and Marwedel, Peter}, title = {Combining Worst-Case Timing Models, Loop Unrolling, and Static Loop Analysis for WCET Minimization}, booktitle = {The 21st Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2009}, pages = {35-44}, address = {Dublin / Ireland}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-ecrts.pdf}, confidential = {n}, abstract = {Program loops are notorious for their optimization potential on modern high-performance architectures. Compilers aim at their aggressive transformation to achieve large improvements of the program performance. 
In particular, the optimization loop unrolling has shown in the past decades to be highly effective achieving significant increases of the average-case performance. In this paper, we present loop unrolling that is tailored towards real-time systems. Our novel optimization is driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. To exploit maximal optimization potential, the determination of a suitable unrolling factor is based on precise loop iteration counts provided by a static loop analysis. In addition, our heuristics avoid adverse effects of unrolling which result from instruction cache overflows and the generation of additional spill code. Results on 45 real-life benchmarks demonstrate that aggressive loop unrolling can yield WCET reductions of up to 13.7\% over simple, naive approaches employed by many production compilers.}, }Program loops are notorious for their optimization potential on modern high-performance architectures. Compilers aim at their aggressive transformation to achieve large improvements of the program performance. In particular, the optimization loop unrolling has shown in the past decades to be highly effective achieving significant increases of the average-case performance. In this paper, we present loop unrolling that is tailored towards real-time systems. Our novel optimization is driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. To exploit maximal optimization potential, the determination of a suitable unrolling factor is based on precise loop iteration counts provided by a static loop analysis. In addition, our heuristics avoid adverse effects of unrolling which result from instruction cache overflows and the generation of additional spill code. Results on 45 real-life benchmarks demonstrate that aggressive loop unrolling can yield WCET reductions of up to 13.7% over simple, naive approaches employed by many production compilers. Paul Lokuciejewski, Fatih Gedikli, Peter Marwedel and Katharina Morik.Automatic WCET Reduction by Machine Learning Based Heuristics for Function Inlining. In Proceedings of the 3rd Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART), pages 1-15Paphos / Cyprus, January 2009[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:smart, author = {Lokuciejewski, Paul and Gedikli, Fatih and Marwedel, Peter and Morik, Katharina}, title = {Automatic WCET Reduction by Machine Learning Based Heuristics for Function Inlining}, booktitle = {Proceedings of the 3rd Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART)}, year = {2009}, pages = {1-15}, address = {Paphos / Cyprus}, month = {jan}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-smart.pdf}, confidential = {n}, abstract = {The application of machine learning techniques in compiler frameworks has become a challenging research area. Learning algorithms are exploited for an automatic generation of optimization heuristics which often outperform hand-crafted models. Moreover, these automatic approaches can effectively tune the compilers' heuristics after larger changes in the optimization sequence or they can be leveraged to tailor heuristics towards a particular architectural model. Previous works focussed on a reduction of the average-case performance. 
In this paper, learning approaches are studied in the context of an automatic minimization of the worst-case execution time (WCET) which is the upper bound of the program's maximum execution time. We show that explicitly taking the new timing model into account allows the construction of compiler heuristics that effectively reduce the WCET. This is demonstrated for the well-known optimization function inlining. Our WCET-driven inlining heuristics based on a fast classifier called random forests outperform standard heuristics by up to 9.1% on average in terms of the WCET reduction. Moreover, we point out that our classifier is highly accurate with a prediction rate for inlining candidates of 84.0%.}, }The application of machine learning techniques in compiler frameworks has become a challenging research area. Learning algorithms are exploited for an automatic generation of optimization heuristics which often outperform hand-crafted models. Moreover, these automatic approaches can effectively tune the compilers' heuristics after larger changes in the optimization sequence or they can be leveraged to tailor heuristics towards a particular architectural model. Previous works focussed on a reduction of the average-case performance. In this paper, learning approaches are studied in the context of an automatic minimization of the worst-case execution time (WCET) which is the upper bound of the program's maximum execution time. We show that explicitly taking the new timing model into account allows the construction of compiler heuristics that effectively reduce the WCET. This is demonstrated for the well-known optimization function inlining. Our WCET-driven inlining heuristics based on a fast classifier called random forests outperform standard heuristics by up to 9.1% on average in terms of the WCET reduction. Moreover, we point out that our classifier is highly accurate with a prediction rate for inlining candidates of 84.0%. Paul Lokuciejewski, Fatih Gedikli and Peter Marwedel.Accelerating WCET-driven Optimizations by the Invariant Path Paradigm - a Case Study of Loop Unswitching. In The 12th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 11-20Nice / France, April 2009[BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:scopes, author = {Lokuciejewski, Paul and Gedikli, Fatih and Marwedel, Peter}, title = {Accelerating WCET-driven Optimizations by the Invariant Path Paradigm - a Case Study of Loop Unswitching}, booktitle = {The 12th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)}, year = {2009}, pages = {11-20}, address = {Nice / France}, month = {apr}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-scopes.pdf}, confidential = {n}, abstract = {The worst-case execution time (WCET) being the upper bound of the maximum execution time corresponds to the longest path through the program's control flow graph. Its reduction is the objective of a WCET optimization. Unlike average-case execution time compiler optimizations which consider a static (most frequently executed) path, the longest path is variable since its optimization might result in another path becoming the effective longest path. To keep path information valid, WCET optimizations typically perform a time-consuming static WCET analysis after each code modification to ensure that subsequent optimization steps operate on the critical path. 
However, a code modification does not always lead to a path switch, making many WCET analyses superfluous. To cope with this problem, we propose a new paradigm called Invariant Path which eliminates the pessimism by indicating whether a path update is mandatory. To demonstrate the paradigm's practical use, we developed a novel optimization called WCET-driven Loop Unswitching which exploits the Invariant Path information. In a case study, our optimization reduced the WCET of real-world benchmarks by up to 18.3\%, while exploiting the Invariant Path paradigm led to a reduction of the optimization time by 57.5\% on average.}, }The worst-case execution time (WCET) being the upper bound of the maximum execution time corresponds to the longest path through the program's control flow graph. Its reduction is the objective of a WCET optimization. Unlike average-case execution time compiler optimizations which consider a static (most frequently executed) path, the longest path is variable since its optimization might result in another path becoming the effective longest path. To keep path information valid, WCET optimizations typically perform a time-consuming static WCET analysis after each code modification to ensure that subsequent optimization steps operate on the critical path. However, a code modification does not always lead to a path switch, making many WCET analyses superfluous. To cope with this problem, we propose a new paradigm called Invariant Path which eliminates the pessimism by indicating whether a path update is mandatory. To demonstrate the paradigm's practical use, we developed a novel optimization called WCET-driven Loop Unswitching which exploits the Invariant Path information. In a case study, our optimization reduced the WCET of real-world benchmarks by up to 18.3%, while exploiting the Invariant Path paradigm led to a reduction of the optimization time by 57.5% on average. Paul Lokuciejewski, Heiko Falk and Peter Marwedel.WCET-driven Cache-based Procedure Positioning Optimizations. In The 20th Euromicro Conference on Real-Time Systems (ECRTS), pages 321-330Prague / Czech Republic, July 2008[BibTeX][PDF][Abstract]@inproceedings { loku:08:ecrts, author = {Lokuciejewski, Paul and Falk, Heiko and Marwedel, Peter}, title = {WCET-driven Cache-based Procedure Positioning Optimizations}, booktitle = {The 20th Euromicro Conference on Real-Time Systems (ECRTS)}, year = {2008}, pages = {321-330}, address = {Prague / Czech Republic}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-ecrts.pdf}, confidential = {n}, abstract = {Procedure Positioning is a well known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of procedures calling each other frequently in the memory avoids overlapping of cache lines and thus decreases the number of cache conflict misses. In standard literature, these positioning techniques are guided by execution profile data and focus on an improved average-case performance. We present two novel positioning optimizations driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. WCET reductions by 10\% on average are achieved. 
Moreover, a combination of positioning and the WCET-driven Procedure Cloning optimization is presented improving the WCET analysis by 36\% on average.}, }Procedure Positioning is a well known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of procedures calling each other frequently in the memory avoids overlapping of cache lines and thus decreases the number of cache conflict misses. In standard literature, these positioning techniques are guided by execution profile data and focus on an improved average-case performance. We present two novel positioning optimizations driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. WCET reductions by 10% on average are achieved. Moreover, a combination of positioning and the WCET-driven Procedure Cloning optimization is presented improving the WCET analysis by 36% on average. Andreas Heinig, René Oertel, Jochen Strunk, Wolfgang Rehm and Heiko Schick.Generalizing the SPUFS concept - a case study towards a common accelerator interface. In Proceedings of the Many-core and Reconfigurable Supercomputing ConferenceBelfast, 1-3 April 2008[BibTeX]@inproceedings { Heinig2008mrsc, author = {Heinig, Andreas and Oertel, Ren\'{e} and Strunk, Jochen and Rehm, Wolfgang and Schick, Heiko}, title = {Generalizing the SPUFS concept - a case study towards a common accelerator interface}, booktitle = {Proceedings of the Many-core and Reconfigurable Supercomputing Conference}, year = {2008}, address = {Belfast}, month = {1-3 April}, confidential = {n}, } Niklas Holsti, Jan Gustafsson, Guillem Bernat, Clément Ballabriga, Armelle Bonenfant, Roman Bourgade, Hugues Cassé, Daniel Cordes, Albrecht Kadlec, Raimund Kirner, Jens Knoop, Paul Lokuciejewski and Merriam.WCET Tool Challenge 2008: Report. In International Workshop on Worst-Case Execution Time Analysis (WCET)Prague / Czech Republic, September 2008[BibTeX][PDF][Abstract]@inproceedings { holsti:08:wcet, author = {Holsti, Niklas and Gustafsson, Jan and Bernat, Guillem and Ballabriga, Cl\'ement and Bonenfant, Armelle and Bourgade, Roman and Cass\'e, Hugues and Cordes, Daniel and Kadlec, Albrecht and Kirner, Raimund and Knoop, Jens and Lokuciejewski, Paul and Merriam,}, title = {WCET Tool Challenge 2008: Report}, booktitle = {International Workshop on Worst-Case Execution Time Analysis (WCET)}, year = {2008}, address = {Prague / Czech Republic}, month = {sep}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-wcet.pdf}, confidential = {n}, abstract = {Following the successful WCET Tool Challenge in 2006, the second event in this series was organized in 2008, again with support from the ARTIST2 Network of Excellence. The WCET Tool Challenge 2008 (WCC'08) provides benchmark programs and poses a number of "analysis problems" about the dynamic, runtime properties of these programs. The participants are challenged to solve these problems with their program-analysis tools. Two kinds of problems are defined: WCET problems, which ask for bounds on the execution time of chosen parts (subprograms) of the benchmarks, under given constraints on input data; and flow-analysis problems, which ask for bounds on the number of times certain parts of the benchmark can be executed, again under some constraints. We describe the organization of WCC'08, the benchmark programs, the participating tools, and the general results, successes, and failures. 
Most participants found WCC'08 to be a useful test of their tools. Unlike the 2006 Challenge, the WCC'08 participants include several tools for the same target (ARM7, LPC2138), and tools that combine measurements and static analysis, as well as pure static-analysis tools. Paul Lokuciejewski, Heiko Falk, Peter Marwedel and Henrik Theiling.WCET-Driven, Code-Size Critical Procedure Cloning. In The 11th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 21-30Munich / Germany, March 2008[BibTeX][PDF][Abstract]@inproceedings { loku:08:scopes, author = {Lokuciejewski, Paul and Falk, Heiko and Marwedel, Peter and Theiling, Henrik}, title = {WCET-Driven, Code-Size Critical Procedure Cloning}, booktitle = {The 11th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)}, year = {2008}, pages = {21-30}, address = {Munich / Germany}, month = {mar}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-scopes.pdf}, confidential = {n}, abstract = {In the domain of the worst-case execution time (WCET) analysis, loops are an inherent source of unpredictability and loss of precision since the determination of tight and safe information on the number of loop iterations is a difficult task. In particular, data-dependent loops whose iteration counts depend on function parameters can not be precisely handled by a timing analysis. Procedure Cloning can be exploited to make these loops explicit within the source code allowing a highly precise WCET analysis. In this paper we extend the standard Procedure Cloning optimization by WCET-aware concepts with the objective to improve the tightness of the WCET estimation. Our novel approach is driven by WCET information which successively eliminates code structures leading to overestimated timing results, thus making the code more suitable for the analysis. In addition, the code size increase during the optimization is monitored and large increases are avoided. The effectiveness of our optimization is shown by tests on real-world benchmarks. After performing our optimization, the estimated WCET is reduced by up to 64.2\% while the employed code transformations yield an additional code size increase of 22.6\% on average.
In contrast, the average-case performance being the original objective of Procedure Cloning showed a slight decrease.}, }In the domain of the worst-case execution time (WCET) analysis, loops are an inherent source of unpredictability and loss of precision since the determination of tight and safe information on the number of loop iterations is a difficult task. In particular, data-dependent loops whose iteration counts depend on function parameters can not be precisely handled by a timing analysis. Procedure Cloning can be exploited to make these loops explicit within the source code allowing a highly precise WCET analysis. In this paper we extend the standard Procedure Cloning optimization by WCET-aware concepts with the objective to improve the tightness of the WCET estimation. Our novel approach is driven by WCET information which successively eliminates code structures leading to overestimated timing results, thus making the code more suitable for the analysis. In addition, the code size increase during the optimization is monitored and large increases are avoided. The effectiveness of our optimization is shown by tests on real-world benchmarks. After performing our optimization, the estimated WCET is reduced by up to 64.2% while the employed code transformations yield an additional code size increase of 22.6% on average. In contrast, the average-case performance being the original objective of Procedure Cloning showed a slight decrease. Sascha Plazar, Paul Lokuciejewski and Peter Marwedel.A Retargetable Framework for Multi-objective WCET-aware High-level Compiler Optimizations. In Proceedings of The 29th IEEE Real-Time Systems Symposium (RTSS) WiP, pages 49-52Barcelona / Spain, December 2008[BibTeX][PDF][Abstract]@inproceedings { plazar:08:rtss, author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter}, title = {A Retargetable Framework for Multi-objective WCET-aware High-level Compiler Optimizations}, booktitle = {Proceedings of The 29th IEEE Real-Time Systems Symposium (RTSS) WiP}, year = {2008}, pages = {49-52}, address = {Barcelona / Spain}, month = {dec}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-rtss.pdf}, confidential = {n}, abstract = {The worst-case execution time (WCET) is a key parameter in the domain of real-time systems and its automatic compiler-based minimization becomes a challenging research area. Although today's embedded system applications are written in a high-level language, most published works consider low-level optimizations which complicate their portability to other processors. In this work, we present a framework for the development of novel WCET-driven high-level optimizations. Our WCET-aware compiler framework provides a multi-target support as well as an integration of different non-functional objectives. It enables multi-objective optimizations, thus opens avenues to a state-of-the-art design of predictable and efficient systems. In addition, the multi-target support provides the opportunity to efficiently evaluate the impact of different compiler optimizations on various processors.}, }The worst-case execution time (WCET) is a key parameter in the domain of real-time systems and its automatic compiler-based minimization becomes a challenging research area. Although today's embedded system applications are written in a high-level language, most published works consider low-level optimizations which complicate their portability to other processors.
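High-level here means transformations applied to the source code itself. A simple example of such a transformation (an illustrative sketch, not taken from the paper) is source-level loop unrolling:

    #include <stdio.h>

    /* Before: a simple accumulation loop. */
    static int sum_plain(const int a[], int n) {
        int s = 0;
        for (int i = 0; i < n; i++) s += a[i];
        return s;
    }

    /* After source-level unrolling by a factor of 4 (n assumed to be a
     * multiple of 4): fewer loop branches per element, at the price of
     * larger code. */
    static int sum_unrolled(const int a[], int n) {
        int s = 0;
        for (int i = 0; i < n; i += 4)
            s += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
        return s;
    }

    int main(void) {
        int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        printf("%d %d\n", sum_plain(a, 8), sum_unrolled(a, 8));
        return 0;
    }

Because such rewrites stay at the source level, they are independent of the target processor, which is the portability argument made in this abstract.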
In this work, we present a framework for the development of novel WCET-driven high-level optimizations. Our WCET-aware compiler framework provides a multi-target support as well as an integration of different non-functional objectives. It enables multi-objective optimizations, thus opens avenues to a state-of-the-art design of predictable and efficient systems. In addition, the multi-target support provides the opportunity to efficiently evaluate the impact of different compiler optimizations on various processors. Peter Marwedel and Heiko Falk (presentation).Memory-architecture aware compilation. In The ARTIST2 Summer School 2008 in EuropeAutrans / France, 2008[BibTeX][PDF]@inproceedings { marwedel:08:artist2, author = {Marwedel, Peter and Falk (presentation), Heiko}, title = {Memory-architecture aware compilation}, booktitle = {The ARTIST2 Summer School 2008 in Europe}, year = {2008}, address = {Autrans / France}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-artist2summerschool.pdf}, confidential = {n}, } Michael Engel and Olaf Spinczyk.Aspects in Hardware - What Do They Look Like?. In Proceedings of the 7th AOSD Workshop on Aspects, Components, and Patterns for Infrastructure Software (AOSD-ACP4IS '08)Brussels, Belgium, April 2008[BibTeX]@inproceedings { engel:08:aosd-acp4is, author = {Engel, Michael and Spinczyk, Olaf}, title = {Aspects in Hardware - What Do They Look Like?}, booktitle = {Proceedings of the 7th AOSD Workshop on Aspects, Components, and Patterns for Infrastructure Software (AOSD-ACP4IS '08)}, year = {2008}, address = {Brussels, Belgium}, month = {apr}, publisher = {ACM Press}, confidential = {n}, } Michael Engel and Olaf Spinczyk.System-on-Chip Integration of Embedded Automotive Controllers. In Proceedings of the First Workshop on Isolation and Integration in Embedded SystemsGlasgow, UK, April 2008[BibTeX]@inproceedings { engel:08:eurosys-iies, author = {Engel, Michael and Spinczyk, Olaf}, title = {System-on-Chip Integration of Embedded Automotive Controllers}, booktitle = {Proceedings of the First Workshop on Isolation and Integration in Embedded Systems}, year = {2008}, address = {Glasgow, UK}, month = {apr}, publisher = {ACM Press}, confidential = {n}, } Paul Lokuciejewski, Heiko Falk, Martin Schwarzer and Peter Marwedel.Tighter WCET Estimates by Procedure Cloning. In 7th International Workshop on Worst-Case Execution Time Analysis (WCET), pages 27-32Pisa/Italy, July 2007[BibTeX][PDF][Abstract]@inproceedings { loku:07:wcet, author = {Lokuciejewski, Paul and Falk, Heiko and Schwarzer, Martin and Marwedel, Peter}, title = {Tighter WCET Estimates by Procedure Cloning}, booktitle = {7th International Workshop on Worst-Case Execution Time Analysis (WCET)}, year = {2007}, pages = {27-32}, address = {Pisa/Italy}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-wcet.pdf}, confidential = {n}, abstract = {Embedded software spends most of its execution time in loops. To allow a precise static WCET analysis, each loop iteration should, in theory, be represented by an individual calling context. However, due to the enormous analysis times of real-world applications, this approach is not feasible and requires a reduction of the analysis complexity by limiting the number of considered contexts. This restricted timing analysis results in imprecise WCET estimates.
In particular, data-dependent loops with iteration counts depending on function parameters cannot be precisely analyzed. In order to reduce the number of contexts that must be implicitly considered, which would otherwise increase the analysis time, we apply the standard compiler optimization \textem{procedure cloning} which improves the program's predictability by making loops explicit and thus allowing a precise annotation of loop bounds. The result is a tight WCET estimation within a reduced analysis time. Our results indicate that reductions of the WCET between 12\% and 95\% were achieved for real-world benchmarks. In contrast, the reduction of the simulated program execution time remained marginal with only 3\%. As will also be shown, this optimization only produces a small overhead for the WCET analysis.}, }Embedded software spends most of its execution time in loops. To allow a precise static WCET analysis, each loop iteration should, in theory, be represented by an individual calling context. However, due to the enormous analysis times of real-world applications, this approach is not feasible and requires a reduction of the analysis complexity by limiting the number of considered contexts. This restricted timing analysis results in imprecise WCET estimates. In particular, data-dependent loops with iteration counts depending on function parameters cannot be precisely analyzed. In order to reduce the number of contexts that must be implicitly considered, which would otherwise increase the analysis time, we apply the standard compiler optimization procedure cloning which improves the program's predictability by making loops explicit and thus allowing a precise annotation of loop bounds. The result is a tight WCET estimation within a reduced analysis time. Our results indicate that reductions of the WCET between 12% and 95% were achieved for real-world benchmarks. In contrast, the reduction of the simulated program execution time remained marginal with only 3%. As will also be shown, this optimization only produces a small overhead for the WCET analysis.
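To make the transformation tangible, here is a minimal C sketch (function names and bounds are invented for illustration, not taken from the paper) of how cloning turns a data-dependent loop bound into a compile-time constant that a timing analyzer can annotate precisely:

```c
/* Before cloning: the loop bound depends on the parameter n, so a
 * static WCET analyzer cannot derive a tight iteration count. */
void filter(int *buf, int n) {
  for (int i = 0; i < n; i++)
    buf[i] >>= 1;
}
/* All call sites pass constants only: filter(a, 64); filter(b, 256); */

/* After cloning: one specialized copy per constant argument. The loop
 * bounds are now literals, so safe and tight flow facts (exactly 64
 * resp. 256 iterations) can be attached for the timing analysis. */
void filter_64(int *buf)  { for (int i = 0; i < 64;  i++) buf[i] >>= 1; }
void filter_256(int *buf) { for (int i = 0; i < 256; i++) buf[i] >>= 1; }
```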
P. Reinhardt, O. Battenfeld, M. Engel and B. Freisleben.A Paravirtualized Scalable Emulation Testbed for Mobile Ad-Hoc Networks. In Proceedings of ICCP07, Oman 2007[BibTeX]@inproceedings { engel:07:iccp, author = {Reinhardt, P. and Battenfeld, O. and Engel, M. and Freisleben, B.}, title = {A Paravirtualized Scalable Emulation Testbed for Mobile Ad-Hoc Networks}, booktitle = {Proceedings of ICCP07, Oman}, year = {2007}, publisher = {IEEE Computer Society Press}, confidential = {n}, } Robert Pyka, Christoph Faßbach, Manish Verma, Heiko Falk and Peter Marwedel.Operating system integrated energy aware scratchpad allocation strategies for multiprocess applications. In 10th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 41-50Nice/France, April 2007[BibTeX][PDF][Abstract]@inproceedings { pyka:07:scopes, author = {Pyka, Robert and Fa{\ss}bach, Christoph and Verma, Manish and Falk, Heiko and Marwedel, Peter}, title = {Operating system integrated energy aware scratchpad allocation strategies for multiprocess applications}, booktitle = {10th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)}, year = {2007}, pages = {41-50}, address = {Nice/France}, month = {apr}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-scopes.pdf}, confidential = {n}, abstract = {Various scratchpad allocation strategies have been developed in the past. Most of them target the reduction of energy consumption. These approaches share the necessity of having direct access to the scratchpad memory. In earlier embedded systems this was always true, but with the increasing complexity of the tasks that systems have to perform, an additional operating system layer between the hardware and the application is becoming mandatory. This paper presents an approach to integrate a scratchpad memory manager into the operating system. The goal is to minimize energy consumption. In contrast to previous work, compile time knowledge about the application's behavior is taken into account. A set of fast heuristic allocation methods is proposed in this paper. An in-depth study and comparison of achieved energy savings and cycle reductions was performed. The results show that even in the highly dynamic environment of an operating system equipped embedded system, up to 83% energy consumption reduction can be achieved.}, }Various scratchpad allocation strategies have been developed in the past. Most of them target the reduction of energy consumption. These approaches share the necessity of having direct access to the scratchpad memory. In earlier embedded systems this was always true, but with the increasing complexity of the tasks that systems have to perform, an additional operating system layer between the hardware and the application is becoming mandatory. This paper presents an approach to integrate a scratchpad memory manager into the operating system. The goal is to minimize energy consumption. In contrast to previous work, compile time knowledge about the application's behavior is taken into account. A set of fast heuristic allocation methods is proposed in this paper. An in-depth study and comparison of achieved energy savings and cycle reductions was performed. The results show that even in the highly dynamic environment of an operating system equipped embedded system, up to 83% energy consumption reduction can be achieved. Heiko Falk, Sascha Plazar and Henrik Theiling.Compile Time Decided Instruction Cache Locking Using Worst-Case Execution Paths. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), pages 143-148Salzburg/Austria, September 2007[BibTeX][PDF][Abstract]@inproceedings { falk:07:codes_isss, author = {Falk, Heiko and Plazar, Sascha and Theiling, Henrik}, title = {Compile Time Decided Instruction Cache Locking Using Worst-Case Execution Paths}, booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, year = {2007}, pages = {143-148}, address = {Salzburg/Austria}, month = {sep}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-codes+isss_1.pdf}, confidential = {n}, abstract = {Caches are notorious for their unpredictability. It is difficult or even impossible to predict if a memory access results in a definite cache hit or miss. This unpredictability is highly undesired for real-time systems. The Worst-Case Execution Time \textem{(WCET)} of software running on an embedded processor is one of the most important metrics during real-time system design. The WCET depends to a large extent on the total amount of time spent for memory accesses. In the presence of caches, WCET analysis must always assume a memory access to be a cache miss if it cannot be guaranteed that it is a hit. Hence, WCETs for cached systems are imprecise due to the overestimation caused by the caches. Modern caches can be controlled by software.
The software can load parts of its code or of its data into the cache and lock the cache afterwards. Cache locking prevents the cache's contents from being flushed by deactivating the replacement. A locked cache is highly predictable and leads to very precise WCET estimates, because the uncertainty caused by the replacement strategy is eliminated completely. This paper presents techniques exploring the lockdown of instruction caches at compile-time to minimize WCETs. In contrast to the current state of the art in the area of cache locking, our techniques explicitly take the worst-case execution path into account during each step of the optimization procedure. This way, we can make sure that exactly those parts of the code that lead to the highest WCET reduction are locked in the I-cache. The results demonstrate that WCET reductions from 54\% up to 73\% can be achieved with an acceptable amount of CPU seconds required for the optimization and WCET analyses themselves.}, }Caches are notorious for their unpredictability. It is difficult or even impossible to predict if a memory access results in a definite cache hit or miss. This unpredictability is highly undesired for real-time systems. The Worst-Case Execution Time (WCET) of software running on an embedded processor is one of the most important metrics during real-time system design. The WCET depends to a large extent on the total amount of time spent for memory accesses. In the presence of caches, WCET analysis must always assume a memory access to be a cache miss if it cannot be guaranteed that it is a hit. Hence, WCETs for cached systems are imprecise due to the overestimation caused by the caches. Modern caches can be controlled by software. The software can load parts of its code or of its data into the cache and lock the cache afterwards. Cache locking prevents the cache's contents from being flushed by deactivating the replacement. A locked cache is highly predictable and leads to very precise WCET estimates, because the uncertainty caused by the replacement strategy is eliminated completely. This paper presents techniques exploring the lockdown of instruction caches at compile-time to minimize WCETs. In contrast to the current state of the art in the area of cache locking, our techniques explicitly take the worst-case execution path into account during each step of the optimization procedure. This way, we can make sure that exactly those parts of the code that lead to the highest WCET reduction are locked in the I-cache. The results demonstrate that WCET reductions from 54% up to 73% can be achieved with an acceptable amount of CPU seconds required for the optimization and WCET analyses themselves.
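The selection strategy described in this abstract can be pictured as a WCET-driven greedy loop. The following self-contained C toy (region names, sizes and gains are invented) repeatedly locks the code region promising the largest WCET reduction until the lockable capacity is exhausted; an actual implementation would re-run the WCET analysis after every locking step, since the worst-case path may shift.

```c
#include <stdio.h>

/* One lockable code region with its footprint in the I-cache and its
 * estimated WCET reduction when locked (all numbers invented). */
typedef struct {
  const char *name;
  int size;       /* bytes occupied in the I-cache               */
  int wcet_gain;  /* estimated WCET reduction if locked (cycles) */
  int locked;
} Region;

int main(void) {
  Region r[] = {
    { "main_loop",   512, 9000, 0 },
    { "isr_handler", 256, 4000, 0 },
    { "init_code",  1024,  500, 0 },
  };
  int n = sizeof r / sizeof r[0];
  int capacity = 1024;            /* lockable I-cache bytes */

  for (;;) {
    int best = -1;
    for (int i = 0; i < n; i++)   /* pick the most beneficial region */
      if (!r[i].locked && r[i].size <= capacity &&
          (best < 0 || r[i].wcet_gain > r[best].wcet_gain))
        best = i;
    if (best < 0)
      break;                      /* nothing fits any more */
    r[best].locked = 1;           /* lock contents, replacement disabled */
    capacity -= r[best].size;
    printf("lock %s (gain %d cycles)\n", r[best].name, r[best].wcet_gain);
  }
  return 0;
}
```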
Paul Lokuciejewski, Heiko Falk, Martin Schwarzer, Peter Marwedel and Henrik Theiling.Influence of Procedure Cloning on WCET Prediction. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), pages 137-142Salzburg/Austria, September 2007[BibTeX][PDF][Abstract]@inproceedings { loku:07:codes_isss, author = {Lokuciejewski, Paul and Falk, Heiko and Schwarzer, Martin and Marwedel, Peter and Theiling, Henrik}, title = {Influence of Procedure Cloning on WCET Prediction}, booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, year = {2007}, pages = {137-142}, address = {Salzburg/Austria}, month = {sep}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-codes+isss_2.pdf}, confidential = {n}, abstract = {For worst-case execution time \textem{(WCET)} analysis, loops in particular are an inherent source of unpredictability and loss of precision. This is caused by the difficulty of obtaining safe and tight information on the number of iterations executed by a loop in the worst case. In particular, data-dependent loops whose iteration counts depend on function parameters are extremely difficult to analyze precisely. Procedure cloning helps by making such data-dependent loops explicit within the source code, thus making them accessible for high-precision WCET analyses. This paper presents the effect of procedure cloning applied at the source-code level on worst-case execution time. The optimization generates specialized versions of functions being called with constant values as arguments. In standard literature, it is used to enable further optimizations like constant propagation within functions and to reduce calling overhead. We show that procedure cloning for WCET minimization leads to significant improvements. Reductions of the WCET from 12\% up to 95\% were measured for real-life benchmarks. These results demonstrate that procedure cloning improves analyzability and predictability of real-time applications dramatically. In contrast, average-case performance, the criterion procedure cloning was originally developed for, is reduced by only 3\% at most. Our results also show that these WCET reductions only implied small overhead during WCET analysis.}, }For worst-case execution time (WCET) analysis, loops in particular are an inherent source of unpredictability and loss of precision. This is caused by the difficulty of obtaining safe and tight information on the number of iterations executed by a loop in the worst case. In particular, data-dependent loops whose iteration counts depend on function parameters are extremely difficult to analyze precisely. Procedure cloning helps by making such data-dependent loops explicit within the source code, thus making them accessible for high-precision WCET analyses. This paper presents the effect of procedure cloning applied at the source-code level on worst-case execution time. The optimization generates specialized versions of functions being called with constant values as arguments. In standard literature, it is used to enable further optimizations like constant propagation within functions and to reduce calling overhead. We show that procedure cloning for WCET minimization leads to significant improvements. Reductions of the WCET from 12% up to 95% were measured for real-life benchmarks. These results demonstrate that procedure cloning improves analyzability and predictability of real-time applications dramatically. In contrast, average-case performance, the criterion procedure cloning was originally developed for, is reduced by only 3% at most.
Our results also show that these WCET reductions only implied small overhead during WCET analysis. Peter Marwedel, Heiko Falk, Sascha Plazar, Robert Pyka and Lars Wehmeyer.Automatic mapping to tightly-coupled memories and cache locking. In Proceedings of 4th HiPEAC Industrial Workshop on Compilers and ArchitecturesCambridge, UK, August 2007[BibTeX][PDF][Link]@inproceedings { marwedel:07:hipeac, author = {Marwedel, Peter and Falk, Heiko and Plazar, Sascha and Pyka, Robert and Wehmeyer, Lars}, title = {Automatic mapping to tightly-coupled memories and cache locking}, booktitle = {Proceedings of 4th HiPEAC Industrial Workshop on Compilers and Architectures}, year = {2007}, address = {Cambridge, UK}, month = {aug}, url = {http://www.hipeac.net/industry_workshop4}, keywords = {wcet}, file = {http://www.hipeac.net/system/files?file=session1_3.ppt}, confidential = {n}, } Heiko Falk, Paul Lokuciejewski and Henrik Theiling.Design of a WCET-Aware C Compiler. In 6th International Workshop on Worst-Case Execution Time Analysis (WCET)Dresden/Germany, July 2006[BibTeX][PDF][Abstract]@inproceedings { falk:06:wcet, author = {Falk, Heiko and Lokuciejewski, Paul and Theiling, Henrik}, title = {Design of a WCET-Aware C Compiler}, booktitle = {6th International Workshop on Worst-Case Execution Time Analysis (WCET)}, year = {2006}, address = {Dresden/Germany}, month = {jul}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-wcet_1.pdf}, confidential = {n}, abstract = {This paper presents techniques to tightly integrate worst-case execution time \textem{(WCET)} information into a compiler framework. Currently, a tight integration of WCET information into the compilation process is strongly desired, but only some ad-hoc approaches have been reported so far. Previous publications mainly used self-written WCET estimators with very limited functionality and preciseness during compilation. A very tight integration of a high quality industry-relevant WCET analyzer into a compiler has not been achieved so far. This work is the first to present techniques capable of achieving such a tight coupling between a compiler and the WCET analyzer aiT. This is done by automatically translating the assembly-like contents of the compiler's low-level intermediate representation \textem{(LLIR)} to aiT's exchange format CRL2. Additionally, the results produced by the WCET analyzer are automatically collected and re-imported into the compiler infrastructure. The work described in this paper is smoothly integrated into a C compiler environment for the Infineon TriCore processor. It opens up new possibilities for the design of WCET-aware optimizations in the future. The concepts for extending the compiler infrastructure are kept very general so that they are not limited to WCET information. Rather, it is possible to use our structures also for multi-objective optimization of e.g. best-case execution time \textem{(BCET)} or energy dissipation.}, }This paper presents techniques to tightly integrate worst-case execution time (WCET) information into a compiler framework. Currently, a tight integration of WCET information into the compilation process is strongly desired, but only some ad-hoc approaches have been reported so far. Previous publications mainly used self-written WCET estimators with very limited functionality and preciseness during compilation.
A very tight integration of a high quality industry-relevant WCET analyzer into a compiler has not been achieved so far. This work is the first to present techniques capable of achieving such a tight coupling between a compiler and the WCET analyzer aiT. This is done by automatically translating the assembly-like contents of the compiler's low-level intermediate representation (LLIR) to aiT's exchange format CRL2. Additionally, the results produced by the WCET analyzer are automatically collected and re-imported into the compiler infrastructure. The work described in this paper is smoothly integrated into a C compiler environment for the Infineon TriCore processor. It opens up new possibilities for the design of WCET-aware optimizations in the future. The concepts for extending the compiler infrastructure are kept very general so that they are not limited to WCET information. Rather, it is possible to use our structures also for multi-objective optimization of e.g. best-case execution time (BCET) or energy dissipation. M. Smith, B. Klose, R. Ewerth, T. Friese, M. Engel and B. Freisleben.Runtime Integration of Reconfigurable Hardware in Service-Oriented Grids. In Proceedings of the IEEE International Conference on Web Services (ICWS), Chicago, USA, pages 945-948 2006[BibTeX]@inproceedings { engel:06:icws, author = {Smith, M. and Klose, B. and Ewerth, R. and Friese, T. and Engel, M. and Freisleben, B.}, title = {Runtime Integration of Reconfigurable Hardware in Service-Oriented Grids}, booktitle = {Proceedings of the IEEE International Conference on Web Services (ICWS), Chicago, USA}, year = {2006}, pages = {945-948}, publisher = {IEEE Computer Society Press}, confidential = {n}, } M. Smith, T. Friese, M. Engel, B. Freisleben, G. Koenig and W. Yurcik.Security Issues in On-Demand Grid and Cluster Computing. In Sixth IEEE International Symposium on Cluster Computing and the Grid Workshops (CCGRIDW'06), pages 24 2006[BibTeX]@inproceedings { engel:06:iscc, author = {Smith, M. and Friese, T. and Engel, M. and Freisleben, B. and Koenig, G. and Yurcik, W.}, title = {Security Issues in On-Demand Grid and Cluster Computing}, booktitle = {Sixth IEEE International Symposium on Cluster Computing and the Grid Workshops (CCGRIDW'06)}, year = {2006}, pages = {24}, publisher = {IEEE Computer Society Press}, confidential = {n}, } M. Smith, T. Friese, M. Engel and B. Freisleben.Countering Security Threats in Service-Oriented On-Demand Grid Computing Using Sandboxing and Trusted Computing Techniques. In Journal of Parallel and Distributed Computing, Volume 66, Issue 9, pages 1189-1204 2006[BibTeX]@inproceedings { engel:06:jpdc, author = {Smith, M. and Friese, T. and Engel, M. and Freisleben, B.}, title = {Countering Security Threats in Service-Oriented On-Demand Grid Computing Using Sandboxing and Trusted Computing Techniques}, booktitle = {Journal of Parallel and Distributed Computing, Volume 66, Issue 9}, year = {2006}, pages = {1189-1204}, publisher = {Elsevier}, confidential = {n}, } Heiko Falk and Martin Schwarzer.Loop Nest Splitting for WCET-Optimization and Predictability Improvement.
In 6th International Workshop on Worst-Case Execution Time Analysis (WCET)Dresden/Germany, July 2006[BibTeX][PDF][Abstract]@inproceedings { falk:06:wcet2, author = {Falk, Heiko and Schwarzer, Martin}, title = {Loop Nest Splitting for WCET-Optimization and Predictability Improvement}, booktitle = {6th International Workshop on Worst-Case Execution Time Analysis (WCET)}, year = {2006}, address = {Dresden/Germany}, month = {jul}, keywords = {sco, wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-wcet_2.pdf}, confidential = {n}, abstract = {This paper presents the influence of the loop nest splitting source code optimization on the worst-case execution time \textem{(WCET)}. Loop nest splitting minimizes the number of executed if-statements in loop nests of embedded multimedia applications. It identifies iterations of a loop nest where all if-statements are satisfied and splits the loop nest such that if-statements are not executed at all for large parts of the loop nest's iteration space. Especially loops and if-statements of high-level languages are an inherent source of unpredictability and loss of precision for WCET analysis. This is caused by the fact that it is difficult to obtain safe and tight worst-case estimates of an application's flow of control through these high-level constructs. In addition, the corresponding control flow redirections expressed at the assembly level reduce predictability even more due to the complex pipeline and branch prediction behavior of modern embedded processors. The analysis techniques for loop nest splitting are based on precise mathematical models combined with genetic algorithms. On the one hand, these techniques achieve a significantly more homogeneous structure of the control flow. On the other hand, the precision of our analyses leads to the generation of very accurate high-level flow facts for loops and if-statements. The application of our implemented algorithms to three real-life multimedia benchmarks leads to average speed-ups of 25.0\% - 30.1\%, while WCET is reduced between 34.0\% and 36.3\%.}, }This paper presents the influence of the loop nest splitting source code optimization on the worst-case execution time (WCET). Loop nest splitting minimizes the number of executed if-statements in loop nests of embedded multimedia applications. It identifies iterations of a loop nest where all if-statements are satisfied and splits the loop nest such that if-statements are not executed at all for large parts of the loop nest's iteration space. Especially loops and if-statements of high-level languages are an inherent source of unpredictability and loss of precision for WCET analysis. This is caused by the fact that it is difficult to obtain safe and tight worst-case estimates of an application's flow of control through these high-level constructs. In addition, the corresponding control flow redirections expressed at the assembly level reduce predictability even more due to the complex pipeline and branch prediction behavior of modern embedded processors. The analysis techniques for loop nest splitting are based on precise mathematical models combined with genetic algorithms. On the one hand, these techniques achieve a significantly more homogeneous structure of the control flow. On the other hand, the precision of our analyses leads to the generation of very accurate high-level flow facts for loops and if-statements. The application of our implemented algorithms to three real-life multimedia benchmarks leads to average speed-ups of 25.0% - 30.1%, while WCET is reduced between 34.0% and 36.3%.
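A minimal C sketch of the transformation (bounds invented, not the paper's benchmark): the condition x >= 10 partitions the iteration space, so the nest can be split into one part where it never holds and one where it always holds, removing all condition tests and leaving constant loop bounds for flow-fact annotation.

```c
void then_block(int x, int y);
void else_block(int x, int y);

/* Before: the if-condition is evaluated 36 * 49 = 1764 times. */
void before_splitting(void) {
  for (int x = 0; x < 36; x++)
    for (int y = 0; y < 49; y++)
      if (x >= 10) then_block(x, y);
      else         else_block(x, y);
}

/* After splitting: no if-statement is executed at all. */
void after_splitting(void) {
  for (int x = 0; x < 10; x++)    /* condition never holds here  */
    for (int y = 0; y < 49; y++)
      else_block(x, y);
  for (int x = 10; x < 36; x++)   /* condition always holds here */
    for (int y = 0; y < 49; y++)
      then_block(x, y);
}
```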
Michael Engel and Bernd Freisleben.{TOSKANA:} A Toolkit for Operating System Kernel Aspects. In Transactions on AOSD II 4242, pages 182--226 2006[BibTeX]@inproceedings { engel:06:taosd, author = {Engel, Michael and Freisleben, Bernd}, title = {{TOSKANA:} A Toolkit for Operating System Kernel Aspects}, booktitle = {Transactions on AOSD II}, year = {2006}, editor = {Awais Rashid and Mehmet Aksit}, number = {4242}, series = {Lecture Notes in Computer Science}, pages = {182--226}, publisher = {Springer-Verlag}, confidential = {n}, } Manish Verma and Peter Marwedel.Compilation and Simulation Tool Chain for Memory Aware Energy Optimizations. In Workshop on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS VI)Samos, Greece, July 2006[BibTeX][PDF][Abstract]@inproceedings { verma:06:samos, author = {Verma, Manish and Marwedel, Peter}, title = {Compilation and Simulation Tool Chain for Memory Aware Energy Optimizations}, booktitle = {Workshop on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS VI)}, year = {2006}, address = {Samos, Greece}, month = {jul}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-samos.pdf}, confidential = {n}, abstract = {Memory hierarchies are known to be the energy bottleneck of portable embedded devices. Numerous memory aware energy optimizations have been proposed. However, both the optimization and the validation are performed in an ad-hoc manner as a coherent compilation and simulation framework does not exist as yet. In this paper, we present such a framework for performing memory hierarchy aware energy optimization. Both the compiler and the simulator are configured from a single memory hierarchy description. Significant savings of up to 50\% in the total energy dissipation are reported.}, }Memory hierarchies are known to be the energy bottleneck of portable embedded devices. Numerous memory aware energy optimizations have been proposed. However, both the optimization and the validation are performed in an ad-hoc manner as a coherent compilation and simulation framework does not exist as yet. In this paper, we present such a framework for performing memory hierarchy aware energy optimization. Both the compiler and the simulator are configured from a single memory hierarchy description. Significant savings of up to 50% in the total energy dissipation are reported. M. Smith, M. Engel, S. Hanemann and B. Freisleben.Towards a Roadcasting Communications Infrastructure. In Proceedings of the IEEE International Conference on Mobile Communications and Learning Technologies, pages 213-213 2006[BibTeX]@inproceedings { engel:06:icmclt, author = {Smith, M. and Engel, M. and Hanemann, S. and Freisleben, B.}, title = {Towards a Roadcasting Communications Infrastructure}, booktitle = {Proceedings of the IEEE International Conference on Mobile Communications and Learning Technologies}, year = {2006}, pages = {213-213}, publisher = {IEEE Computer Society Press}, confidential = {n}, } Heiko Falk, Jens Wagner and André Schaefer.Use of a Bit-true Data Flow Analysis for Processor-Specific Source Code Optimization.
In 4th IEEE Workshop on Embedded Systems for Real-Time Multimedia (ESTIMedia), pages 133-138Seoul/Korea, October 2006[BibTeX][PDF][Abstract]@inproceedings { falk:06:estimedia, author = {Falk, Heiko and Wagner, Jens and Schaefer, Andr\'e}, title = {Use of a Bit-true Data Flow Analysis for Processor-Specific Source Code Optimization}, booktitle = {4th IEEE Workshop on Embedded Systems for Real-Time Multimedia (ESTIMedia)}, year = {2006}, pages = {133-138}, address = {Seoul/Korea}, month = {oct}, keywords = {sco}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-estimedia_1.pdf}, confidential = {n}, abstract = {Nowadays, key characteristics of a processor's instruction set are only exploited in high-level languages by using inline assembly or compiler intrinsics. Inserting intrinsics into the source code is up to the programmer, since only few automatic approaches exist. Additionally, these approaches are based on simple code pattern matching strategies. This paper presents techniques for processor-specific code analysis and optimization at the source-level. It is shown how a bit-true data flow analysis is made applicable for source code analysis for the TI C6x DSPs for the very first time. Based on this bit-true analysis, fully automated optimizations superior to conventional pattern matching techniques are presented which optimize saturated arithmetic, reduce bitwidths of variables and exploit SIMD data processing within source codes. The application of our implemented algorithms to complex real-life codes leads to speed-ups between 33\% and 48\% for the optimization of saturated arithmetic, and up to 16\% after SIMD optimization.}, }Nowadays, key characteristics of a processor's instruction set are only exploited in high-level languages by using inline assembly or compiler intrinsics. Inserting intrinsics into the source code is up to the programmer, since only few automatic approaches exist. Additionally, these approaches are based on simple code pattern matching strategies. This paper presents techniques for processor-specific code analysis and optimization at the source-level. It is shown how a bit-true data flow analysis is made applicable for source code analysis for the TI C6x DSPs for the very first time. Based on this bit-true analysis, fully automated optimizations superior to conventional pattern matching techniques are presented which optimize saturated arithmetic, reduce bitwidths of variables and exploit SIMD data processing within source codes. The application of our implemented algorithms to complex real-life codes leads to speed-ups between 33% and 48% for the optimization of saturated arithmetic, and up to 16% after SIMD optimization.
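As an illustration of what such a bit-true analysis can prove, consider the common hand-written saturation idiom below (a generic example, not taken from the paper). A value-range analysis shows that the branches clamp the exact wide sum to the int range, which licenses replacing the whole body with a single saturating machine operation, e.g. a compiler intrinsic of the target DSP such as _sadd on the TI C6x (the exact intrinsic name is an assumption here).

```c
#include <limits.h>

/* Hand-written saturating 32-bit addition. A bit-true value-range
 * analysis can prove that the result is always the mathematical sum
 * clamped to [INT_MIN, INT_MAX]... */
int sat_add(int a, int b) {
  long long sum = (long long)a + b;   /* exact; cannot overflow in 64 bit */
  if (sum > INT_MAX) return INT_MAX;  /* clamp positive overflow */
  if (sum < INT_MIN) return INT_MIN;  /* clamp negative overflow */
  return (int)sum;
}
/* ...so the function body may be replaced by one saturating add
 * instruction of the target processor. */
```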
Heiko Falk and Martin Schwarzer.Loop Nest Splitting for WCET-Optimization and Predictability Improvement. In 4th IEEE Workshop on Embedded Systems for Real-Time Multimedia (ESTIMedia), pages 115-120Seoul/Korea, October 2006[BibTeX][PDF][Abstract]@inproceedings { falk:06:estimedia2, author = {Falk, Heiko and Schwarzer, Martin}, title = {Loop Nest Splitting for WCET-Optimization and Predictability Improvement}, booktitle = {4th IEEE Workshop on Embedded Systems for Real-Time Multimedia (ESTIMedia)}, year = {2006}, pages = {115-120}, address = {Seoul/Korea}, month = {oct}, keywords = {sco, wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-estimedia_2.pdf}, confidential = {n}, abstract = {This paper presents the effect of the loop nest splitting source code optimization on worst-case execution time \textem{(WCET)}. Loop nest splitting minimizes the number of executed if-statements in loop nests of multimedia applications. It identifies iterations where all if-statements are satisfied and splits the loop nest such that if-statements are not executed at all for large parts of the loop nest's iteration space. Especially loops and if-statements are an inherent source of unpredictability and loss of precision for WCET analysis. This is caused by the difficulty of obtaining safe and tight worst-case estimates of an application's high-level control flow. In addition, assembly-level control flow redirections reduce predictability even more due to complex processor pipelines and branch prediction units. Loop nest splitting is based on precise mathematical models combined with genetic algorithms. On the one hand, these techniques achieve a significantly more homogeneous control flow structure. On the other hand, the precision of our analyses enables the generation of very accurate high-level flow facts for loops and if-statements. The application of our implemented algorithms to three real-life benchmarks leads to average speed-ups of 25.0\% - 30.1\%, while WCET is reduced by 34.0\% - 36.3\%.}, }This paper presents the effect of the loop nest splitting source code optimization on worst-case execution time (WCET). Loop nest splitting minimizes the number of executed if-statements in loop nests of multimedia applications. It identifies iterations where all if-statements are satisfied and splits the loop nest such that if-statements are not executed at all for large parts of the loop nest's iteration space. Especially loops and if-statements are an inherent source of unpredictability and loss of precision for WCET analysis. This is caused by the difficulty of obtaining safe and tight worst-case estimates of an application's high-level control flow. In addition, assembly-level control flow redirections reduce predictability even more due to complex processor pipelines and branch prediction units. Loop nest splitting is based on precise mathematical models combined with genetic algorithms. On the one hand, these techniques achieve a significantly more homogeneous control flow structure. On the other hand, the precision of our analyses enables the generation of very accurate high-level flow facts for loops and if-statements. The application of our implemented algorithms to three real-life benchmarks leads to average speed-ups of 25.0% - 30.1%, while WCET is reduced by 34.0% - 36.3%. Heiko Falk, Paul Lokuciejewski and Henrik Theiling.Design of a WCET-Aware C Compiler.
In 4th IEEE Workshop on Embedded Systems for Real-Time Multimedia (ESTIMedia), pages 121-126Seoul/Korea, October 2006[BibTeX][PDF][Abstract]@inproceedings { falk:06:estimedia3, author = {Falk, Heiko and Lokuciejewski, Paul and Theiling, Henrik}, title = {Design of a WCET-Aware C Compiler}, booktitle = {4th IEEE Workshop on Embedded Systems for Real-Time Multimedia (ESTIMedia)}, year = {2006}, pages = {121-126}, address = {Seoul/Korea}, month = {oct}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-estimedia_3.pdf}, confidential = {n}, abstract = {This paper presents techniques to integrate worst-case execution time \textem{(WCET)} data into a compiler. Currently, a tight integration of WCET into compilers is strongly desired, but only some ad-hoc approaches have been reported so far. Previous work mainly used self-written WCET estimators with limited functionality and preciseness during compilation. A very tight integration of a high quality WCET analyzer into a compiler has not yet been achieved. This work is the first to present such a tight coupling between a compiler and the WCET analyzer aiT. This is done by automatically translating the assembly-like contents of the compiler's low-level format \textem{(LLIR)} to aiT's exchange format CRL2. Additionally, the results produced by aiT are automatically collected and re-imported into the compiler infrastructure. The work described in this paper is smoothly integrated into a C compiler for the Infineon TriCore processor. It opens up new possibilities for the design of WCET-aware optimizations in the future. The concepts for extending the compiler structure are kept very general so that they are not limited to WCET information. Rather, it is possible to use our concepts also for multi-objective optimization of e.g. best-case execution time \textem{(BCET)} or energy dissipation.}, }This paper presents techniques to integrate worst-case execution time (WCET) data into a compiler. Currently, a tight integration of WCET into compilers is strongly desired, but only some ad-hoc approaches have been reported so far. Previous work mainly used self-written WCET estimators with limited functionality and preciseness during compilation. A very tight integration of a high quality WCET analyzer into a compiler has not yet been achieved. This work is the first to present such a tight coupling between a compiler and the WCET analyzer aiT. This is done by automatically translating the assembly-like contents of the compiler's low-level format (LLIR) to aiT's exchange format CRL2. Additionally, the results produced by aiT are automatically collected and re-imported into the compiler infrastructure. The work described in this paper is smoothly integrated into a C compiler for the Infineon TriCore processor. It opens up new possibilities for the design of WCET-aware optimizations in the future. The concepts for extending the compiler structure are kept very general so that they are not limited to WCET information. Rather, it is possible to use our concepts also for multi-objective optimization of e.g. best-case execution time (BCET) or energy dissipation. Heiko Falk.Control Flow driven Code Hoisting at the Source Code Level.
In Optimizations for DSP and Embedded SystemsSan Jose/United States, March 2005[BibTeX][PDF][Abstract]@inproceedings { falk:05:odes, author = {Falk, Heiko}, title = {Control Flow driven Code Hoisting at the Source Code Level}, booktitle = {Optimizations for DSP and Embedded Systems}, year = {2005}, address = {San Jose/United States}, month = {mar}, keywords = {sco}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-odes.pdf}, confidential = {n}, abstract = {This paper presents a novel source code optimization technique called advanced code hoisting. It aims at moving portions of code from inner loops to outer ones. In contrast to existing code motion techniques, this is done under consideration of control flow aspects. Depending on the conditions of \textem{if}-statements, moving an expression can lead to an increased number of executions of this expression. This paper contains formal descriptions of the polyhedral models used for control flow analysis so as to suppress a code motion in such a situation. Due to the inherent portability of source code transformations, a very detailed benchmarking using 8 different processors was performed. The application of our implemented techniques to real-life multimedia benchmarks leads to average speed-ups of 25.5\%-52\% and energy savings of 33.4\%-74.5\%. Furthermore, advanced code hoisting leads to improved pipeline and cache behavior and smaller code sizes.}, }This paper presents a novel source code optimization technique called advanced code hoisting. It aims at moving portions of code from inner loops to outer ones. In contrast to existing code motion techniques, this is done under consideration of control flow aspects. Depending on the conditions of if-statements, moving an expression can lead to an increased number of executions of this expression. This paper contains formal descriptions of the polyhedral models used for control flow analysis so as to suppress a code motion in such a situation. Due to the inherent portability of source code transformations, a very detailed benchmarking using 8 different processors was performed. The application of our implemented techniques to real-life multimedia benchmarks leads to average speed-ups of 25.5%-52% and energy savings of 33.4%-74.5%. Furthermore, advanced code hoisting leads to improved pipeline and cache behavior and smaller code sizes.
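The pitfall this abstract alludes to fits in a few lines of C (an invented example): hoisting only pays off if the guarded expression would otherwise be evaluated more than once, which is precisely what the polyhedral control-flow analysis has to establish before moving the code.

```c
void use(int v);

/* Before: k*k is evaluated up to 100 * 100 times, but only when cond holds. */
void before_hoisting(int k, int cond) {
  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)
      if (cond)
        use(k * k);      /* loop-invariant expression inside the nest */
}

/* After advanced code hoisting: a single evaluation. Note the trade-off:
 * if cond were never true, hoisting would increase the evaluation count
 * from 0 to 1 -- the control-flow analysis suppresses such cases. */
void after_hoisting(int k, int cond) {
  int t = k * k;         /* hoisted out of both loops */
  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)
      if (cond)
        use(t);
}
```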
Lars Wehmeyer and Peter Marwedel.Influence of Memory Hierarchies on Predictability for Time Constrained Embedded Software. In Design Automation and Test in Europe (DATE)Munich, Germany, March 2005[BibTeX][PDF][Abstract]@inproceedings { wehm:05:date, author = {Wehmeyer, Lars and Marwedel, Peter}, title = {Influence of Memory Hierarchies on Predictability for Time Constrained Embedded Software}, booktitle = {Design Automation and Test in Europe (DATE)}, year = {2005}, address = {Munich, Germany}, month = {mar}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-date.pdf}, confidential = {n}, abstract = {Safety-critical embedded systems having to meet real-time constraints are expected to be highly predictable in order to guarantee at design time that certain timing deadlines will always be met. This requirement usually prevents designers from utilizing caches due to their highly dynamic, thus hardly predictable behavior. The integration of scratchpad memories represents an alternative approach which allows the system to benefit from a performance gain comparable to that of caches while at the same time maintaining predictability. In this work, we compare the impact of scratchpad memories and caches on worst case execution time (WCET) analysis results. We show that caches, despite requiring complex techniques, can have a negative impact on the predicted WCET, while the estimated WCET for scratchpad memories scales with the achieved performance gain at no extra analysis cost.}, }Safety-critical embedded systems having to meet real-time constraints are expected to be highly predictable in order to guarantee at design time that certain timing deadlines will always be met. This requirement usually prevents designers from utilizing caches due to their highly dynamic, thus hardly predictable behavior. The integration of scratchpad memories represents an alternative approach which allows the system to benefit from a performance gain comparable to that of caches while at the same time maintaining predictability. In this work, we compare the impact of scratchpad memories and caches on worst case execution time (WCET) analysis results. We show that caches, despite requiring complex techniques, can have a negative impact on the predicted WCET, while the estimated WCET for scratchpad memories scales with the achieved performance gain at no extra analysis cost. M. Engel and B. Freisleben.Supporting Autonomic Computing Functionality via Dynamic Operating System Kernel Aspects. In Proceedings of the Fourth International Conference on Aspect Oriented Software Development, Chicago, USA, pages 51-62 2005[BibTeX]@inproceedings { engel:05:aosd, author = {Engel, M. and Freisleben, B.}, title = {Supporting Autonomic Computing Functionality via Dynamic Operating System Kernel Aspects}, booktitle = {Proceedings of the Fourth International Conference on Aspect Oriented Software Development, Chicago, USA}, year = {2005}, pages = {51-62}, publisher = {ACM Press}, confidential = {n}, } Peter Marwedel, Manish Verma and Lars Wehmeyer.Compiler optimizations improving the processor/memory interface. In Workshop on Optimizing Compiler Assisted SoC Assembly (OCASA) September 2005[BibTeX]@inproceedings { marw:05:ocasa, author = {Marwedel, Peter and Verma, Manish and Wehmeyer, Lars}, title = {Compiler optimizations improving the processor/memory interface}, booktitle = {Workshop on Optimizing Compiler Assisted SoC Assembly (OCASA)}, year = {2005}, month = {sep}, confidential = {n}, } Manish Verma and Peter Marwedel.Memory Optimization Techniques for Low-Power Embedded Processors. In IFIP VIVA Workshop - Fundamentals and Methods for Low-Power Information ProcessingBonn, Germany, September 2005[BibTeX][PDF][Abstract]@inproceedings { verma:05:viva, author = {Verma, Manish and Marwedel, Peter}, title = {Memory Optimization Techniques for Low-Power Embedded Processors}, booktitle = {IFIP VIVA Workshop - Fundamentals and Methods for Low-Power Information Processing}, year = {2005}, address = {Bonn, Germany}, month = {sep}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-viva.pdf}, confidential = {n}, abstract = {Power consumption is an important design issue for contemporary portable embedded devices. It is known that the next generation of portable devices will feature faster processors and larger memories, both of which require high operational power.
The memory subsystem has already been identified as the energy bottleneck of the entire system. Consequently, memory hierarchies are being constructed to reduce the memory subsystem's energy dissipation. Caches and scratchpad memories represent two contrasting memory architectures. Scratchpads are more area and power efficient than caches. However, they require explicit support from the compiler for managing their contents. In this work, we present three approaches for the prudent utilization of the scratchpad memory of an ARM7 processor and of a M5 DSP based system. The first approach is based on the following observations. Firstly, a small memory requires less energy per access than a large memory. Secondly, applications in general consist of small and frequently accessed arrays and large but infrequently accessed arrays. Consequently, the approach partitions the large scratchpad into several small scratchpads. The arrays are also statically mapped such that the small arrays are mapped to small and energy efficient scratchpads. The approach leads to average energy savings of 52\% and 35\% in the data memory subsystem of the ARM7 and the M5 DSP, respectively. The second approach utilizes the scratchpad as an instruction buffer in a cache based memory hierarchy. The approach models the cache as a conflict graph and assigns instructions to the scratchpad. The objective is to minimize the energy consumption of the system while preserving the predictable behavior of the memory hierarchy. The approach results in an average energy saving of 21\% against the above approach for the ARM7 based system. The last approach optimizes the energy consumption of the system by overlaying memory objects (\textem{i.e.} code segments and data elements) onto the scratchpad. Memory objects with non-conflicting life-times are assigned to the same location on the scratchpad. This improves the scratchpad utilization; however, it requires copying memory objects on and off the scratchpad during the execution of the application. Average energy reductions of 34\% and 33\% are reported for the ARM7 and the M5 DSP based systems, respectively.}, }Power consumption is an important design issue for contemporary portable embedded devices. It is known that the next generation of portable devices will feature faster processors and larger memories, both of which require high operational power. The memory subsystem has already been identified as the energy bottleneck of the entire system. Consequently, memory hierarchies are being constructed to reduce the memory subsystem's energy dissipation. Caches and scratchpad memories represent two contrasting memory architectures. Scratchpads are more area and power efficient than caches. However, they require explicit support from the compiler for managing their contents. In this work, we present three approaches for the prudent utilization of the scratchpad memory of an ARM7 processor and of a M5 DSP based system. The first approach is based on the following observations. Firstly, a small memory requires less energy per access than a large memory. Secondly, applications in general consist of small and frequently accessed arrays and large but infrequently accessed arrays. Consequently, the approach partitions the large scratchpad into several small scratchpads. The arrays are also statically mapped such that the small arrays are mapped to small and energy efficient scratchpads.
The approach leads to average energy savings of 52% and 35% in the data memory subsystem of the ARM7 and the M5 DSP, respectively. The second approach utilizes the scratchpad as an instruction buffer in a cache based memory hierarchy. The approach models the cache as a conflict graph and assigns instructions to the scratchpad. The objective is to minimize the energy consumption of the system while preserving the predictable behavior of the memory hierarchy. The approach results in an average energy saving of 21% against the above approach for the ARM7 based system. The last approach optimizes the energy consumption of the system by overlaying memory objects (i.e. code segments and data elements) onto the scratchpad. Memory objects with non-conflicting life-times are assigned to the same location on the scratchpad. This improves the scratchpad utilization; however, it requires copying memory objects on and off the scratchpad during the execution of the application. Average energy reductions of 34% and 33% are reported for the ARM7 and the M5 DSP based systems, respectively. M. Engel, M. Mezini and B. Freisleben.Creating a Component-Based Multi-Server OS From Existing Source Code Using Aspect-Oriented Programming. In Proceedings of ICCCP'05 2005[BibTeX]@inproceedings { engel:05:icccp, author = {Engel, M. and Mezini, M. and Freisleben, B.}, title = {Creating a Component-Based Multi-Server OS From Existing Source Code Using Aspect-Oriented Programming}, booktitle = {Proceedings of ICCCP'05}, year = {2005}, publisher = {IEEE Computer Society Press}, confidential = {n}, } M. Engel and B. Freisleben.Using a Low-Level Virtual Machine to Improve Dynamic Aspect Support in Operating System Kernels. In Proceedings of the AOSD ACPIS Workshop 2005, pages 1-6 2005[BibTeX]@inproceedings { engel:05:acp4is, author = {Engel, M. and Freisleben, B.}, title = {Using a Low-Level Virtual Machine to Improve Dynamic Aspect Support in Operating System Kernels}, booktitle = {Proceedings of the AOSD ACPIS Workshop 2005}, year = {2005}, pages = {1-6}, publisher = {ACM Press}, confidential = {n}, } Peter Marwedel.Towards laying common grounds for embedded system design education. In Workshop on Embedded Systems Education (WESE) 2005[BibTeX][PDF][Abstract]@inproceedings { marwedel:05:wese, author = {Marwedel, Peter}, title = {Towards laying common grounds for embedded system design education}, booktitle = {Workshop on Embedded Systems Education (WESE)}, year = {2005}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-wese.pdf}, confidential = {n}, abstract = {In this paper, we propose to introduce a common introductory course for embedded system education. The course puts the different areas of embedded system design into perspective and avoids an early over-specialization. Also, it motivates the students to attend more advanced theoretical courses. The content, the structure and the prerequisites of such a course are outlined. The course requires a basic understanding of computer hardware and software and can typically be taught in the second or third year.}, }In this paper, we propose to introduce a common introductory course for embedded system education. The course puts the different areas of embedded system design into perspective and avoids an early over-specialization. Also, it motivates the students to attend more advanced theoretical courses. The content, the structure and the prerequisites of such a course are outlined.
The course requires a basic understanding of computer hardware and software and can typically be taught in the second or third year. M. Engel and B. Freisleben.Autonomic Network Services on a Microkernel. In Proceedings of EUROCON, Belgrade, Serbia, pages 636-639 2005[BibTeX]@inproceedings { engel:06:eurocon1, author = {Engel, M. and Freisleben, B.}, title = {Autonomic Network Services on a Microkernel}, booktitle = {Proceedings of EUROCON, Belgrade, Serbia}, year = {2005}, pages = {636-639}, publisher = {IEEE Computer Society Press}, confidential = {n}, } M. Engel and B. Freisleben.Dynamic Aspect Support for Native Code. In Proceedings of EUROCON, Belgrade, Serbia, pages 732-735 2005[BibTeX]@inproceedings { engel:06:eurocon2, author = {Engel, M. and Freisleben, B.}, title = {Dynamic Aspect Support for Native Code}, booktitle = {Proceedings of EUROCON, Belgrade, Serbia}, year = {2005}, pages = {732-735}, publisher = {IEEE Computer Society Press}, confidential = {n}, } Manish Verma, Klaus Petzold, Lars Wehmeyer, Heiko Falk and Peter Marwedel.Scratchpad Sharing Strategies for Multiprocess Embedded Systems: A First Approach. In IEEE 3rd Workshop on Embedded System for Real-Time Multimedia (ESTIMedia), pages 115-120Jersey City, USA, September 2005[BibTeX][PDF][Abstract]@inproceedings { verma:05:estimedia, author = {Verma, Manish and Petzold, Klaus and Wehmeyer, Lars and Falk, Heiko and Marwedel, Peter}, title = {Scratchpad Sharing Strategies for Multiprocess Embedded Systems: A First Approach}, booktitle = {IEEE 3rd Workshop on Embedded System for Real-Time Multimedia (ESTIMedia)}, year = {2005}, pages = {115-120}, address = {Jersey City, USA}, month = {sep}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-estimedia.pdf}, confidential = {n}, abstract = {Portable embedded systems require diligence in managing their energy consumption. Thus, power efficient processors coupled with onchip memories (e.g. caches, scratchpads) are the base of today's portable devices. Scratchpads are more energy efficient than caches but require software support for their utilization. Portable devices' applications consist of multiple processes for different tasks. However, all the previous scratchpad allocation approaches only consider single process applications. In this paper, we propose a set of optimal strategies to reduce the energy consumption of applications by sharing the scratchpad among multiple processes. The strategies assign both code and data elements to the scratchpad and result in average total energy reductions of 9\%-20\% against a published single process approach. Furthermore, the strategies generate Pareto-optimal curves for the applications allowing design time exploration of energy/scratchpad size tradeoffs.}, }Portable embedded systems require diligence in managing their energy consumption. Thus, power efficient processors coupled with onchip memories (e.g. caches, scratchpads) are the base of today's portable devices. Scratchpads are more energy efficient than caches but require software support for their utilization. Portable devices' applications consist of multiple processes for different tasks. However, all the previous scratchpad allocation approaches only consider single process applications. In this paper, we propose a set of optimal strategies to reduce the energy consumption of applications by sharing the scratchpad among multiple processes. 
The strategies assign both code and data elements to the scratchpad and result in average total energy reductions of 9%-20% against a published single process approach. Furthermore, the strategies generate Pareto-optimal curves for the applications allowing design time exploration of energy/scratchpad size tradeoffs. Lars Wehmeyer and Peter Marwedel.Influence of Onchip Scratchpad Memories on WCET prediction. In Proceedings of the 4th International Workshop on Worst-Case Execution Time (WCET) AnalysisCatania, Sicily, Italy, June 2004[BibTeX][PDF][Abstract]@inproceedings { wehm:04:wcet, author = {Wehmeyer, Lars and Marwedel, Peter}, title = {Influence of Onchip Scratchpad Memories on WCET prediction}, booktitle = {Proceedings of the 4th International Workshop on Worst-Case Execution Time (WCET) Analysis}, year = {2004}, address = {Catania, Sicily, Italy}, month = {jun}, keywords = {wcet}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-WCET.pdf}, confidential = {n}, abstract = {In contrast to standard PCs and many high-performance computer systems, systems that have to meet real-time requirements usually do not feature caches, since caches primarily improve the average case performance, whereas their impact on WCET is generally hard to predict. Especially in embedded systems, scratchpad memories have become popular. Since these small, fast memories can be controlled by the programmer or the compiler, their behavior is perfectly predictable. In this paper, we study for the first time the impact of scratchpad memories on worst case execution time (WCET) prediction. Our results indicate that scratchpads can significantly improve WCET at no extra analysis cost.}, }In contrast to standard PCs and many high-performance computer systems, systems that have to meet real-time requirements usually do not feature caches, since caches primarily improve the average case performance, whereas their impact on WCET is generally hard to predict. Especially in embedded systems, scratchpad memories have become popular. Since these small, fast memories can be controlled by the programmer or the compiler, their behavior is perfectly predictable. In this paper, we study for the first time the impact of scratchpad memories on worst case execution time (WCET) prediction. Our results indicate that scratchpads can significantly improve WCET at no extra analysis cost. Lars Wehmeyer, Urs Helmig and Peter Marwedel.Compiler-optimized Usage of Partitioned Memories. In Proceedings of the 3rd Workshop on Memory Performance Issues (WMPI2004)Munich, Germany, June 2004[BibTeX][PDF][Abstract]@inproceedings { wehm:04:wmpi, author = {Wehmeyer, Lars and Helmig, Urs and Marwedel, Peter}, title = {Compiler-optimized Usage of Partitioned Memories}, booktitle = {Proceedings of the 3rd Workshop on Memory Performance Issues (WMPI2004)}, year = {2004}, address = {Munich, Germany}, month = {jun}, file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-WMPI.pdf}, confidential = {n}, abstract = {In order to meet the requirements concerning both performance and energy consumption in embedded systems, new memory architectures are being introduced. Beside the well-known use of caches in the memory hierarchy, processor cores today also include small onchip memories called scratchpad memories whose usage is not controlled by hardware, but rather by the programmer or the compiler. Techniques for utilization of these scratchpads have been known for some time. 
Manish Verma, Lars Wehmeyer and Peter Marwedel. Cache Aware Scratchpad Allocation. In DATE, Paris, France, February 2004. [BibTeX][PDF][Abstract]
In the context of portable embedded systems, reducing energy is one of the prime objectives. Most high-end embedded microprocessors include onchip instruction and data caches, along with a small energy-efficient scratchpad. Previous approaches for utilizing the scratchpad did not consider caches and hence fail for this now-common architecture. In the presented work, we use the scratchpad for storing instructions and propose a generic Cache Aware Scratchpad Allocation (CASA) algorithm. We report an average reduction of 8-29% in instruction memory energy consumption compared to a previously published technique for benchmarks from the MediaBench suite. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against preloaded loop caches, we report average energy savings of 20-44%.
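The core intuition of cache-aware allocation is that moving a hot code block to the scratchpad saves not only its cache-access energy but also the conflict misses it would otherwise cause. The greedy density heuristic below is only a sketch of that intuition under assumed energy constants; the actual CASA algorithm models cache contents and conflict behavior much more precisely.

```python
# Hedged sketch: score instruction blocks by saved access energy plus avoided
# conflict-miss energy, then pack greedily by gain per byte. Illustration only.
E_CACHE, E_SPM, E_MISS = 1.0, 0.4, 10.0   # assumed nJ per access / avoided miss

def casa_greedy(blocks, spm_size):
    """blocks: dict name -> (bytes, executions, conflict_misses_removed)."""
    def gain(b):
        size, execs, misses = blocks[b]
        return execs * (E_CACHE - E_SPM) + misses * E_MISS
    chosen, free = [], spm_size
    for b in sorted(blocks, key=lambda b: gain(b) / blocks[b][0], reverse=True):
        if blocks[b][0] <= free:
            chosen.append(b)
            free -= blocks[b][0]
    return chosen

blocks = {"mp3_loop": (384, 9000, 1200), "init": (256, 40, 0), "huff": (128, 3000, 800)}
print(casa_greedy(blocks, 512))  # -> ['huff', 'mp3_loop'] with these numbers
```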
Michael Engel and Guido Germano. CITY at home: Monte Carlo option pricing distributed on personal computers. In Proc. of the 10th International Conference on Computing in Economics and Finance of the Society for Computational Economics, 2004. [BibTeX]

Heiko Falk and Manish Verma. Combined Data Partitioning and Loop Nest Splitting for Energy Consumption Minimization. In SCOPES, pages 137-151, Amsterdam, The Netherlands, September 2004. [BibTeX][PDF][Abstract]
For mobile embedded systems, energy consumption is a limiting factor because of today's battery capacities. Besides the processor, memory accesses consume a large amount of energy, so the use of additional, less power-hungry memories like caches or scratchpads is common. This paper presents a combined approach to energy consumption minimization consisting of two complementary, phase-coupled optimizations, viz. data partitioning and loop nest splitting. In a first step, data partitioning splits the large arrays found in typical embedded software into smaller ones, which are placed onto an on-chip scratchpad memory. Although effective w.r.t. energy dissipation, this optimization adds overhead to the code, since the correct part of a partitioned array has to be selected at runtime. Therefore, the control flow is optimized in a second step: loop nests containing if-statements are split using genetic algorithms, minimizing the number of executed if-statements. However, loop nest splitting increases code size and can potentially annul the program layout achieved by the first step. Consequently, the proposed approach applies both optimizations iteratively until a local optimum is found. This framework of combined memory and control flow optimization leads to considerable energy savings for a representative set of typical embedded software routines. Using an accurate energy model for the ARM7 processor, energy savings between 20.3% and 43.3% were measured.
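The iterate-until-local-optimum coupling described above fits in a few lines. In the sketch below, partition_data, split_loop_nests, and estimate_energy are hypothetical stand-ins for the paper's transformations and its ARM7 energy model; only the fixpoint loop itself is the point.

```python
def optimize(program, estimate_energy, partition_data, split_loop_nests):
    """Alternate both optimizations until the energy estimate stops improving."""
    best = estimate_energy(program)
    while True:
        candidate = split_loop_nests(partition_data(program))
        energy = estimate_energy(candidate)
        if energy >= best:        # no further gain: local optimum reached
            return program
        program, best = candidate, energy

# Trivial stand-ins: 'program' is just an energy number that each pass improves
# a little, so the loop terminates once the passes stop helping.
print(optimize(100.0,
               estimate_energy=lambda prog: prog,
               partition_data=lambda prog: prog * 0.9,
               split_loop_nests=lambda prog: prog * 0.95 + 3))
```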
Markus Lorenz and Peter Marwedel. Phase Coupled Code Generation for DSPs Using a Genetic Algorithm. In DATE, pages 1270-1275, June 2004. [BibTeX][PDF][Abstract]
The growing use of digital signal processors (DSPs) in embedded systems necessitates optimizing compilers that support their special hardware features. Due to the irregular architectures of today's DSPs, there is a need for compilers that are capable of coupling the highly interdependent code generation subtasks and of performing graph-based code selection. In this paper, we present a code generator that performs graph-based code selection and completely couples code selection, instruction scheduling (including compaction) and register allocation. In addition, our code generator takes into account effects of the subsequent address code generation phase. To solve the phase coupling problem and to handle its complexity, our code generator is based on a genetic algorithm. Experimental results for several benchmarks and an MP3 application on two DSPs show the effectiveness and retargetability of our approach. Using the presented techniques, the number of execution cycles is reduced by 51% on average for the M3-DSP and by 38% on average for the ADSP2100 compared to standard techniques.
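A genetic algorithm suits this problem because one chromosome can fix all interdependent decisions at once and be evaluated as a whole. The skeleton below encodes one joint decision per IR operation and minimizes an estimated cycle count; the gene encoding, the random cost table, and all GA parameters are invented to illustrate the loop, not taken from the paper.

```python
# Hedged sketch of a GA for phase-coupled code generation decisions.
import random

N_OPS, CHOICES, POP, GENS = 12, 4, 30, 50
random.seed(1)
# cost[op][choice]: estimated cycles for taking this joint decision at this op
cost = [[random.randint(1, 5) for _ in range(CHOICES)] for _ in range(N_OPS)]

def fitness(chrom):                      # lower is better: estimated cycles
    return sum(cost[i][g] for i, g in enumerate(chrom))

def crossover(a, b):                     # single-point crossover
    cut = random.randrange(1, N_OPS)
    return a[:cut] + b[cut:]

def mutate(chrom, rate=0.1):             # re-roll a gene with probability rate
    return [random.randrange(CHOICES) if random.random() < rate else g for g in chrom]

pop = [[random.randrange(CHOICES) for _ in range(N_OPS)] for _ in range(POP)]
for _ in range(GENS):
    pop.sort(key=fitness)
    elite = pop[:POP // 2]               # truncation selection
    pop = elite + [mutate(crossover(*random.sample(elite, 2)))
                   for _ in range(POP - len(elite))]
print("best cycle estimate:", fitness(min(pop, key=fitness)))
```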
Peter Marwedel, Lars Wehmeyer, Manish Verma, Stefan Steinke and Urs Helmig. Fast, predictable and low energy memory references through architecture-aware compilation. In ASPDAC, pages 4-11, January 2004. [BibTeX][PDF][Abstract]
The design of future high-performance embedded systems is hampered by two problems: First, the required hardware needs more energy than is available from batteries. Second, current cache-based approaches for bridging the increasing speed gap between processors and memories cannot guarantee predictable real-time behavior. This paper contributes to solving both problems by describing a comprehensive set of algorithms that can be applied at design time in order to maximally exploit scratch pad memories (SPMs). We show that both the energy consumption and the computed worst-case execution time (WCET) can be reduced by up to 80% and 48%, respectively, by establishing a strong link between the memory architecture and the compiler.

Manish Verma, Lars Wehmeyer and Peter Marwedel. Dynamic Overlay of Scratchpad Memory for Energy Minimization. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), Stockholm, Sweden, September 2004. [BibTeX][PDF][Abstract]
The memory subsystem accounts for a significant portion of the aggregate energy budget of contemporary embedded systems, and there is a large potential for optimizing its energy consumption. Consequently, novel memories as well as novel algorithms for their efficient utilization are being designed. Scratchpads are known to perform better than caches in terms of power, performance, area and predictability; however, unlike caches, they depend upon software allocation techniques for their utilization. In this paper, we present an allocation technique which analyzes the application and inserts instructions to dynamically copy both code segments and variables onto the scratchpad at runtime. We demonstrate that the problem of dynamically overlaying the scratchpad is an extension of the Global Register Allocation problem. The overlay problem is solved optimally using ILP formulation techniques. Our approach improves upon the only previously known allocation technique for statically allocating both variables and code segments onto the scratchpad. Experiments report an average reduction of 34% and 18% in the energy consumption and the runtime of the applications, respectively. A minimal increase in code size is also reported.
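The overlay idea is that an object occupies scratchpad space only during its live range, at the price of copy-in (and, for modified data, copy-out) energy. The greedy per-region check below is purely illustrative under assumed energy constants; the paper instead solves the problem optimally with an ILP derived from global register allocation.

```python
# Hedged sketch of dynamic scratchpad overlay: per program region, copy in the
# objects whose access savings outweigh the copy cost. Numbers are invented.
E_SPM, E_MAIN, E_COPY_PER_BYTE = 0.4, 3.0, 0.05   # assumed nJ values

def overlay_plan(regions, spm_size):
    """regions: list of dicts name -> (bytes, accesses) live in that region.
    Returns, per region, the objects worth copying into the scratchpad."""
    plan = []
    for objects in regions:
        def net_gain(o):
            size, acc = objects[o]
            return acc * (E_MAIN - E_SPM) - 2 * size * E_COPY_PER_BYTE  # copy in + out
        free, selected = spm_size, []
        for o in sorted(objects, key=net_gain, reverse=True):
            size, _ = objects[o]
            if net_gain(o) > 0 and size <= free:
                selected.append(o)
                free -= size
        plan.append(selected)
    return plan

regions = [{"frame": (512, 2000), "coeffs": (256, 50)},
           {"coeffs": (256, 4000), "tmp": (128, 10)}]
print(overlay_plan(regions, 512))  # -> [['frame'], ['coeffs', 'tmp']]
```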
M. Engel, B. Freisleben, M. Smith and S. Hanemann. Wireless Ad-Hoc Network Emulation Using Microkernel-Based Virtual Linux Systems. In Proceedings of the 5th EUROSIM Congress on Modeling and Simulation, Marne la Vallee, France, pages 198-203, EUROSIM Publishers, 2004. [BibTeX]
Markus Lorenz, Peter Marwedel, Thorsten Dräger, Gerhard Fettweis and Rainer Leupers. Compiler based Exploration of DSP Energy Savings by SIMD Operations. In ASPDAC, pages 839-842, June 2004. [BibTeX][PDF][Abstract]
The growing use of digital signal processors (DSPs) in embedded systems necessitates optimizing compilers that support their special architecture features. Besides irregular DSP architectures aimed at reducing chip size and energy consumption, single instruction multiple data (SIMD) functionality is frequently integrated with the intention of improving performance. In order to get an energy-efficient system consisting of processor and compiler, it is necessary to optimize hardware as well as software. It is not obvious that SIMD operations save any energy: if n operations are executed in parallel, each of them might consume the same amount of energy as if they were executed sequentially. Up to now, no work has investigated the influence of compiler-generated code containing SIMD operations on energy consumption. This paper explores the energy saving potential of SIMD operations for a DSP using a generic compilation framework with an integrated instruction-level energy cost model for our target architecture. Effects of SIMD operations on the energy consumption are shown for several benchmarks and an MP3 application.
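The "not obvious" point above reduces to simple instruction-mix arithmetic: n-way SIMD pays off only if the energy of one SIMD instruction plus any packing overhead stays below n times the scalar cost. The numbers below are invented for illustration; the paper draws its conclusions from a measured energy model of its target DSP.

```python
# Hedged sketch: compare the energy of a scalar instruction mix against an
# equivalent 2-way SIMD mix with packing overhead. All costs are invented.

def energy(counts, cost_nj):
    """Total energy of an instruction mix from counts and per-instruction costs."""
    return sum(counts[i] * cost_nj[i] for i in counts)

scalar = {"add": 4000, "mul": 2000}
simd   = {"add2": 2000, "mul2": 1000, "pack": 600}   # 2-way SIMD + packing ops

cost_nj = {"add": 1.0, "mul": 2.5, "add2": 1.6, "mul2": 3.8, "pack": 1.2}
print("scalar:", energy(scalar, cost_nj), "nJ")      # 4000*1.0 + 2000*2.5 = 9000.0
print("SIMD:  ", energy(simd, cost_nj), "nJ")        # 3200 + 3800 + 720  = 7720.0
```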