| Helena Kotthaus, Lea Schönberger, Andreas Lang, Jian-Jia Chen and Peter Marwedel. Can Flexible Multi-Core Scheduling Help to Execute Machine Learning Algorithms Resource-Efficiently?. In 22nd International Workshop on Software and Compilers for Embedded Systems, pages 59--62, 2019 [BibTeX][Link]@inproceedings { kotthaus/2019b,
  author       = {Kotthaus, Helena and Sch{\"o}nberger, Lea and Lang, Andreas and Chen, Jian-Jia and Marwedel, Peter},
  title        = {Can Flexible Multi-Core Scheduling Help to Execute Machine Learning Algorithms Resource-Efficiently?},
  booktitle    = {22nd International Workshop on Software and Compilers for Embedded Systems},
  year         = {2019},
  series       = {SCOPES '19},
  pages        = {59--62},
  publisher    = {ACM},
  url          = {https://dl.acm.org/citation.cfm?id=3323986},
  keywords     = {Lea},
  confidential = {n},
} |
| Helena Kotthaus, Andreas Lang and Peter Marwedel. Optimizing Parallel R Programs via Dynamic Scheduling Strategies. In Abstract Booklet of the International R User Conference (UseR!), Brisbane, Australia, July 2018 [BibTeX][Link]@inproceedings { kotthaus/2018a,
  author       = {Kotthaus, Helena and Lang, Andreas and Marwedel, Peter},
  title        = {Optimizing Parallel {R} Programs via Dynamic Scheduling Strategies},
  booktitle    = {Abstract Booklet of the International R User Conference (UseR!)},
  year         = {2018},
  address      = {Brisbane, Australia},
  month        = jul,
  url          = {https://stat.ethz.ch/R-manual/R-devel/library/parallel/doc/parallel.pdf},
  confidential = {n},
} |
| Olaf Neugebauer, Peter Marwedel, Roland Kühn and Michael Engel. Quality Evaluation Strategies for Approximate Computing in Embedded Systems. In Technological Innovation for Smart Systems: 8th IFIP WG 5.5/SOCOLNET Advanced Doctoral Conference on Computing, Electrical and Industrial Systems, DoCEIS 2017, Costa de Caparica, Portugal, May 3-5, 2017, Proceedings, pages 203--210, 2017 [BibTeX][Link]@inproceedings { Neugebauer2017,
  author       = {Neugebauer, Olaf and Marwedel, Peter and K{\"u}hn, Roland and Engel, Michael},
  title        = {Quality Evaluation Strategies for Approximate Computing in Embedded Systems},
  booktitle    = {Technological Innovation for Smart Systems: 8th IFIP WG 5.5/SOCOLNET Advanced Doctoral Conference on Computing, Electrical and Industrial Systems, DoCEIS 2017, Costa de Caparica, Portugal, May 3-5, 2017, Proceedings},
  year         = {2017},
  editor       = {Camarinha-Matos, Luis M. and Parreira-Rocha, Mafalda and Ramezani, Javaneh},
  pages        = {203--210},
  publisher    = {Springer International Publishing},
  doi          = {10.1007/978-3-319-56077-9_19},
  url          = {http://dx.doi.org/10.1007/978-3-319-56077-9_19},
  confidential = {n},
} |
| Helena Kotthaus, Jakob Richter, Andreas Lang, Janek Thomas, Bernd Bischl, Peter Marwedel, Jörg Rahnenführer and Michel Lang. RAMBO: Resource-Aware Model-Based Optimization with Scheduling for Heterogeneous Runtimes and a Comparison with Asynchronous Model-Based Optimization. In Proceedings of the 11th International Conference: Learning and Intelligent Optimization (LION 11), pages 180--195, 2017 [BibTeX][Link][Abstract]@inproceedings { kotthaus/2017a,
  author       = {Kotthaus, Helena and Richter, Jakob and Lang, Andreas and Thomas, Janek and Bischl, Bernd and Marwedel, Peter and Rahnenf{\"u}hrer, J{\"o}rg and Lang, Michel},
  title        = {{RAMBO}: Resource-Aware Model-Based Optimization with Scheduling for Heterogeneous Runtimes and a Comparison with Asynchronous Model-Based Optimization},
  booktitle    = {Proceedings of the 11th International Conference: Learning and Intelligent Optimization (LION 11)},
  year         = {2017},
  pages        = {180--195},
  series       = {Lecture Notes in Computer Science},
  publisher    = {Springer},
  url          = {http://www.springer.com/de/book/9783319694030},
  confidential = {n},
  abstract     = {Sequential model-based optimization is a popular technique for global optimization of expensive black-box functions. It uses a regression model to approximate the objective function and iteratively proposes new interesting points. Deviating from the original formulation, it is often indispensable to apply parallelization to speed up the computation. This is usually achieved by evaluating as many points per iteration as there are workers available. However, if runtimes of the objective function are heterogeneous, resources might be wasted by idle workers. Our new knapsack-based scheduling approach aims at increasing the effectiveness of parallel optimization by efficient resource utilization. Derived from an extra regression model we use runtime predictions of point evaluations to efficiently map evaluations to workers and reduce idling. We compare our approach to five established parallelization strategies on a set of continuous functions with heterogeneous runtimes. Our benchmark covers comparisons of synchronous and asynchronous model-based approaches and investigates the scalability.},
} Sequential model-based optimization is a popular technique for global optimization of expensive black-box functions. It uses a regression model to approximate the objective function and iteratively proposes new interesting points. Deviating from the original formulation, it is often indispensable to apply parallelization to speed up the computation. This is usually achieved by evaluating as many points per iteration as there are workers available. However, if runtimes of the objective function are heterogeneous, resources might be wasted by idle workers. Our new knapsack-based scheduling approach aims at increasing the effectiveness of parallel optimization by efficient resource utilization. Derived from an extra regression model we use runtime predictions of point evaluations to efficiently map evaluations to workers and reduce idling. We compare our approach to five established parallelization strategies on a set of continuous functions with heterogeneous runtimes. Our benchmark covers comparisons of synchronous and asynchronous model-based approaches and investigates the scalability.
|
| Helena Kotthaus, Andreas Lang, Olaf Neugebauer and Peter Marwedel. R goes Mobile: Efficient Scheduling for Parallel R Programs on Heterogeneous Embedded Systems. In Abstract Booklet of the International R User Conference (UseR!), pages 74, Brussels, Belgium, July 2017 [BibTeX][Link]@inproceedings { kotthaus/2017b,
  author       = {Kotthaus, Helena and Lang, Andreas and Neugebauer, Olaf and Marwedel, Peter},
  title        = {{R} goes Mobile: Efficient Scheduling for Parallel {R} Programs on Heterogeneous Embedded Systems},
  booktitle    = {Abstract Booklet of the International R User Conference (UseR!)},
  year         = {2017},
  pages        = {74},
  address      = {Brussels, Belgium},
  month        = jul,
  url          = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2017_user_kotthaus.pdf},
  confidential = {n},
} |
| Kuan-Hsun Chen, Björn Bönninghoff, Jian-Jia Chen and Peter Marwedel. Compensate or Ignore? Meeting Control Robustness Requirements through Adaptive Soft-Error Handling. In Languages, Compilers, Tools and Theory for Embedded Systems (LCTES), Santa Barbara, CA, U.S.A., June 2016 [BibTeX][PDF][Link][Abstract]@inproceedings { Chenlctes2016,
  author       = {Chen, Kuan-Hsun and B{\"o}nninghoff, Bj{\"o}rn and Chen, Jian-Jia and Marwedel, Peter},
  title        = {Compensate or Ignore? Meeting Control Robustness Requirements through Adaptive Soft-Error Handling},
  booktitle    = {Languages, Compilers, Tools and Theory for Embedded Systems (LCTES)},
  year         = {2016},
  address      = {Santa Barbara, CA, U.S.A.},
  month        = jun,
  organization = {ACM},
  doi          = {10.1145/2907950.2907952},
  url          = {http://dx.doi.org/10.1145/2907950.2907952},
  keywords     = {kuan},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-khchen-lctes.pdf},
  confidential = {n},
  abstract     = {To avoid catastrophic events like unrecoverable system failures on mobile and embedded systems caused by soft-errors, software-based error detection and compensation techniques have been proposed. Methods like error-correction codes or redundant execution can offer high flexibility and allow for application-specific fault-tolerance selection without the needs of special hardware supports. However, such software-based approaches may lead to system overload due to the execution time overhead. An adaptive deployment of such techniques to meet both application requirements and system constraints is desired. From our case study, we observe that a control task can tolerate limited errors with acceptable performance loss. Such tolerance can be modeled as a (m, k) constraint which requires at least m correct runs out of any k consecutive runs to be correct. In this paper, we discuss how a given (m, k) constraint can be satisfied by adopting patterns of task instances with individual error detection and compensation capabilities. We introduce static strategies and provide a formal feasibility analysis for validation. Furthermore, we develop an adaptive scheme that extends our initial approach with online awareness that increases efficiency while preserving analysis results. The effectiveness of our method is shown in a real-world case study as well as for synthesized task sets.},
} To avoid catastrophic events like unrecoverable system failures on mobile and embedded systems caused by soft-errors, software-based error detection and compensation techniques have been proposed. Methods like error-correction codes or redundant execution can offer high flexibility and allow for application-specific fault-tolerance selection without the needs of special hardware supports. However, such software-based approaches may lead to system overload due to the execution time overhead. An adaptive deployment of such techniques to meet both application requirements and system constraints is desired. From our case study, we observe that a control task can tolerate limited errors with acceptable performance loss. Such tolerance can be modeled as a (m, k) constraint which requires at least m correct runs out of any k consecutive runs to be correct. In this paper, we discuss how a given (m, k) constraint can be satisfied by adopting patterns of task instances with individual error detection and compensation capabilities. We introduce static strategies and provide a formal feasibility analysis for validation. Furthermore, we develop an adaptive scheme that extends our initial approach with online awareness that increases efficiency while preserving analysis results. The effectiveness of our method is shown in a real-world case study as well as for synthesized task sets.
|
| Ingo Korb, Helena Kotthaus and Peter Marwedel. mmapcopy: Efficient Memory Footprint Reduction using Application Knowledge. In Proceedings of the 31st Annual ACM Symposium on Applied Computing, Pisa, Italy, 2016 [BibTeX][PDF][Abstract]@inproceedings { korb:2016:sac,
  author       = {Korb, Ingo and Kotthaus, Helena and Marwedel, Peter},
  title        = {mmapcopy: Efficient Memory Footprint Reduction using Application Knowledge},
  booktitle    = {Proceedings of the 31st Annual ACM Symposium on Applied Computing},
  year         = {2016},
  address      = {Pisa, Italy},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016-korb-mmapcopy.pdf},
  confidential = {n},
  abstract     = {Memory requirements can be a limiting factor for programs dealing with large data structures. Especially interpreted programming languages that are used to deal with large vectors like R suffer from memory overhead when copying such data structures. Avoiding data duplication directly in the application can reduce the memory requirements. Alternatively, generic kernel-level memory reduction functionality like deduplication and compression can lower the amount of memory required, but they need to compensate for missing application knowledge by utilizing more CPU time, leading to excessive overhead. To allow new optimizations based on the application's knowledge about its own memory utilization, we propose to introduce a new system call. This system call uses the existing copy-on-write functionality of the Linux kernel to avoid duplicating memory when data is copied. Our experiments using real-world benchmarks written in the R language show that our approach can yield significant improvement in CPU time compared to Kernel Samepage Merging without compromising the amount of memory saved.},
} Memory requirements can be a limiting factor for programs dealing with large data structures. Especially interpreted programming languages that are used to deal with large vectors like R suffer from memory overhead when copying such data structures. Avoiding data duplication directly in the application can reduce the memory requirements. Alternatively, generic kernel-level memory reduction functionality like deduplication and compression can lower the amount of memory required, but they need to compensate for missing application knowledge by utilizing more CPU time, leading to excessive overhead. To allow new optimizations based on the application's knowledge about its own memory utilization, we propose to introduce a new system call. This system call uses the existing copy-on-write functionality of the Linux kernel to avoid duplicating memory when data is copied. Our experiments using real-world benchmarks written in the R language show that our approach can yield significant improvement in CPU time compared to Kernel Samepage Merging without compromising the amount of memory saved.
|
| Jakob Richter, Helena Kotthaus, Bernd Bischl, Peter Marwedel, Jörg Rahnenführer and Michel Lang. Faster Model-Based Optimization through Resource-Aware Scheduling Strategies. In Proceedings of the 10th International Conference: Learning and Intelligent Optimization (LION 10), volume 10079 of Lecture Notes in Computer Science, pages 267--273, 2016 [BibTeX][PDF][Link][Abstract]@inproceedings { kotthaus/2016a,
  author       = {Richter, Jakob and Kotthaus, Helena and Bischl, Bernd and Marwedel, Peter and Rahnenf{\"u}hrer, J{\"o}rg and Lang, Michel},
  title        = {Faster Model-Based Optimization through Resource-Aware Scheduling Strategies},
  booktitle    = {Proceedings of the 10th International Conference: Learning and Intelligent Optimization (LION 10)},
  year         = {2016},
  series       = {Lecture Notes in Computer Science},
  volume       = {10079},
  pages        = {267--273},
  publisher    = {Springer},
  url          = {http://link.springer.com/chapter/10.1007/978-3-319-50349-3_22},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2016_kotthaus_lion10.pdf},
  confidential = {n},
  abstract     = {We present a Resource-Aware Model-Based Optimization framework RAMBO that leads to efficient utilization of parallel computer architectures through resource-aware scheduling strategies. Conventional MBO fits a regression model on the set of already evaluated configurations and their observed performances to guide the search. Due to its inherent sequential nature, an efficient parallel variant can not directly be derived, as only the most promising configuration w.r.t. an infill criterion is evaluated in each iteration. This issue has been addressed by generalized infill criteria in order to propose multiple points simultaneously for parallel execution in each sequential step. However, these extensions in general neglect systematic runtime differences in the configuration space which often leads to underutilized systems. We estimate runtimes using an additional surrogate model to improve the scheduling and demonstrate that our framework approach already yields improved resource utilization on two exemplary classification tasks.},
} We present a Resource-Aware Model-Based Optimization framework RAMBO that leads to efficient utilization of parallel computer architectures through resource-aware scheduling strategies. Conventional MBO fits a regression model on the set of already evaluated configurations and their observed performances to guide the search. Due to its inherent sequential nature, an efficient parallel variant can not directly be derived, as only the most promising configuration w.r.t. an infill criterion is evaluated in each iteration. This issue has been addressed by generalized infill criteria in order to propose multiple points simultaneously for parallel execution in each sequential step. However, these extensions in general neglect systematic runtime differences in the configuration space which often leads to underutilized systems. We estimate runtimes using an additional surrogate model to improve the scheduling and demonstrate that our framework approach already yields improved resource utilization on two exemplary classification tasks.
|
| Helena Kotthaus, Jakob Richter, Andreas Lang, Michel Lang and Peter Marwedel. Resource-Aware Scheduling Strategies for Parallel Machine Learning R Programs through RAMBO. In Abstract Booklet of the International R User Conference (UseR!), number 195, Stanford, USA, June 2016 [BibTeX][Link][Abstract]@inproceedings { kotthaus:2016b,
  author       = {Kotthaus, Helena and Richter, Jakob and Lang, Andreas and Lang, Michel and Marwedel, Peter},
  title        = {Resource-Aware Scheduling Strategies for Parallel Machine Learning {R} Programs through {RAMBO}},
  booktitle    = {Abstract Booklet of the International R User Conference (UseR!)},
  year         = {2016},
  number       = {195},
  address      = {Stanford, USA},
  month        = jun,
  url          = {http://user2016.org/files/abs-book.pdf},
  confidential = {n},
  abstract     = {We present resource-aware scheduling strategies for parallel R programs leading to efficient utilization of parallel computer architectures by estimating resource demands. We concentrate on applications that consist of independent tasks.
The R programming language is increasingly used to process large data sets in parallel, which requires a high amount of resources. One important application is parameter tuning of machine learning algorithms where evaluations need to be executed in parallel to reduce runtime. Here, resource demands of tasks heavily vary depending on the algorithm configuration. Running such an application in a naive parallel way leads to inefficient resource utilization and thus to long runtimes. Therefore, the R package ``parallel'' offers a scheduling strategy, called ``load balancing''.
It dynamically allocates tasks to worker processes. This option is recommended when tasks have widely different computation times or if computer architectures are heterogeneous. We analyzed memory and CPU utilization of parallel applications with our TraceR profiling tool and found that the load balancing mechanism is not sufficient for parallel tasks with high variance in resource demands. A scheduling strategy needs to know resource demands of a task before execution to efficiently map applications to available resources.
Therefore, we build a regression model to estimate resource demands based on previous evaluated tasks. Resource estimates like runtime are then used to guide our scheduling strategies. Those strategies are integrated in our RAMBO (Resource-Aware Model-Based Optimization) Framework. Compared to standard mechanisms of the parallel package our approach yields improved resource utilization.},
} We present resource-aware scheduling strategies for parallel R programs leading to efficient utilization of parallel computer architectures by estimating resource demands. We concentrate on applications that consist of independent tasks.
The R programming language is increasingly used to process large data sets in parallel, which requires a high amount of resources. One important application is parameter tuning of machine learning algorithms where evaluations need to be executed in parallel to reduce runtime. Here, resource demands of tasks heavily vary depending on the algorithm configuration. Running such an application in a naive parallel way leads to inefficient resource utilization and thus to long runtimes. Therefore, the R package "parallel" offers a scheduling strategy, called "load balancing".
It dynamically allocates tasks to worker processes. This option is recommended when tasks have widely different computation times or if computer architectures are heterogeneous. We analyzed memory and CPU utilization of parallel applications with our TraceR profiling tool and found that the load balancing mechanism is not sufficient for parallel tasks with high variance in resource demands. A scheduling strategy needs to know resource demands of a task before execution to efficiently map applications to available resources.
Therefore, we build a regression model to estimate resource demands based on previous evaluated tasks. Resource estimates like runtime are then used to guide our scheduling strategies. Those strategies are integrated in our RAMBO (Resource-Aware Model-Based Optimization) Framework. Compared to standard mechanisms of the parallel package our approach yields improved resource utilization.
|
| Helena Kotthaus, Ingo Korb and Peter Marwedel. Performance Analysis for Parallel R Programs: Towards Efficient Resource Utilization. In Abstract Booklet of the International R User Conference (UseR!), pages 66, Aalborg, Denmark, June 2015 [BibTeX][Link]@inproceedings { kotthaus/2015a,
  author       = {Kotthaus, Helena and Korb, Ingo and Marwedel, Peter},
  title        = {Performance Analysis for Parallel {R} Programs: Towards Efficient Resource Utilization},
  booktitle    = {Abstract Booklet of the International R User Conference (UseR!)},
  year         = {2015},
  pages        = {66},
  address      = {Aalborg, Denmark},
  month        = jun,
  url          = {http://user2015.math.aau.dk/docs/useR2015-BookOfAbstracts.pdf},
  confidential = {n},
} |
| Helena Kotthaus, Ingo Korb and Peter Marwedel. Distributed Performance Analysis for R. In R Implementation, Optimization and Tooling Workshop (RIOT), Prague, Czech Republic, July 2015 [BibTeX][Link]@inproceedings { kotthaus/2015b,
  author       = {Kotthaus, Helena and Korb, Ingo and Marwedel, Peter},
  title        = {Distributed Performance Analysis for {R}},
  booktitle    = {R Implementation, Optimization and Tooling Workshop (RIOT)},
  year         = {2015},
  address      = {Prague, Czech Republic},
  month        = jul,
  url          = {http://2015.ecoop.org/track/RIOT-2015-papers#program},
  confidential = {n},
} |
| Andreas Heinig, Florian Schmoll, Björn Bönninghoff, Peter Marwedel and Michael Engel. FAME: Flexible Real-Time Aware Error Correction by Combining Application Knowledge and Run-Time Information. In Proceedings of the 11th Workshop on Silicon Errors in Logic - System Effects (SELSE), 2015 [BibTeX][PDF]@inproceedings { heinig:2015:selse,
  author       = {Heinig, Andreas and Schmoll, Florian and B{\"o}nninghoff, Bj{\"o}rn and Marwedel, Peter and Engel, Michael},
  title        = {{FAME}: Flexible Real-Time Aware Error Correction by Combining Application Knowledge and Run-Time Information},
  booktitle    = {Proceedings of the 11th Workshop on Silicon Errors in Logic - System Effects (SELSE)},
  year         = {2015},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-heinig-selse2015.pdf},
  confidential = {n},
} |
| Olaf Neugebauer, Michael Engel and Peter Marwedel. Multi-Objective Aware Communication Optimization for Resource-Restricted Embedded Systems. In Proceedings of Architecture of Computing Systems. Proceedings, ARCS 2015 [BibTeX][PDF][Abstract]@inproceedings { neugebauer:2015:arcs,
  author       = {Neugebauer, Olaf and Engel, Michael and Marwedel, Peter},
  title        = {Multi-Objective Aware Communication Optimization for Resource-Restricted Embedded Systems},
  booktitle    = {Proceedings of Architecture of Computing Systems. Proceedings, ARCS},
  year         = {2015},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-neugebauer-arcs.pdf},
  confidential = {n},
  abstract     = {Creating efficient parallel software for current embedded multicore systems is a complex and error-prone task. While automatic parallelization tools help to exploit the performance of multicores, most of these systems waste optimization opportunities since they neglect to consider hardware details such as communication performance and memory hierarchies. In addition, most tools do not allow multi-criterial optimization for objectives such as performance and energy. These approaches are especially relevant in the embedded domain. In this paper we present PICO, an approach that enables multi-objective optimization of embedded parallel programs. In combination with a state-of-the-art parallelization approach for sequential C code, PICO uses high-level models and simulators for performance and energy consumption optimization. As a result, PICO generates a set of Pareto-optimal solutions using a genetic algorithm-based optimization. These solutions allow an embedded system designer to choose a parallelization solution which exhibits a suitable trade-off between the required speedup and the resulting energy consumption according to a given system's requirements. Using PICO, we were able to reduce energy consumption by about 35% compared to the sequential execution for a heterogeneous architecture. Further, runtime reductions by roughly 55% were achieved for a benchmark on a homogeneous platform.},
} Creating efficient parallel software for current embedded multicore systems is a complex and error-prone task. While automatic parallelization tools help to exploit the performance of multicores, most of these systems waste optimization opportunities since they neglect to consider hardware details such as communication performance and memory hierarchies. In addition, most tools do not allow multi-criterial optimization for objectives such as performance and energy. These approaches are especially relevant in the embedded domain. In this paper we present PICO, an approach that enables multi-objective optimization of embedded parallel programs. In combination with a state-of-the-art parallelization approach for sequential C code, PICO uses high-level models and simulators for performance and energy consumption optimization. As a result, PICO generates a set of Pareto-optimal solutions using a genetic algorithm-based optimization. These solutions allow an embedded system designer to choose a parallelization solution which exhibits a suitable trade-off between the required speedup and the resulting energy consumption according to a given system's requirements. Using PICO, we were able to reduce energy consumption by about 35% compared to the sequential execution for a heterogeneous architecture. Further, runtime reductions by roughly 55% were achieved for a benchmark on a homogeneous platform.
|
| Olaf Neugebauer, Pascal Libuschewski, Michael Engel, Heinrich Mueller and Peter Marwedel. Plasmon-based Virus Detection on Heterogeneous Embedded Systems. In Proceedings of Workshop on Software & Compilers for Embedded Systems (SCOPES), 2015 [BibTeX][PDF][Abstract]@inproceedings { neugebauer2015:scopes,
  author       = {Neugebauer, Olaf and Libuschewski, Pascal and Engel, Michael and Mueller, Heinrich and Marwedel, Peter},
  title        = {Plasmon-based Virus Detection on Heterogeneous Embedded Systems},
  booktitle    = {Proceedings of Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
  year         = {2015},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2015-neugebauer-scopes.pdf},
  confidential = {n},
  abstract     = {Embedded systems, e.g. in computer vision applications, are expected to provide significant amounts of computing power to process large data volumes. Many of these systems, such as used in medical diagnosis, are mobile devices and face significant challenges to provide sufficient performance while operating on a constrained energy budget.
Modern embedded MPSoC platforms use heterogeneous CPU and GPU cores providing a large number of optimization parameters. This allows to find useful trade-offs between energy consumption and performance for a given application. In this paper, we describe how the complex data processing required for PAMONO, a novel type of biosensor for the detection of biological viruses, can efficiently be implemented on a state-of-the-art heterogeneous MPSoC platform. An additional optimization dimension explored is the achieved quality of service. Reducing the virus detection accuracy enables additional optimizations not achievable by modifying hardware or software parameters alone.
Instead of relying on often inaccurate simulation models, our design space exploration employs a hardware-in-the-loop approach to evaluate the performance and energy consumption on the embedded target platform. Trade-offs between performance, energy and accuracy are controlled by a genetic algorithm running on a PC control system which deploys the evaluation tasks to a number of connected embedded boards. Using our optimization approach, we are able to achieve frame rates meeting the requirements without losing accuracy. Further, our approach is able to reduce the energy consumption by 93% with a still reasonable detection quality.},
} Embedded systems, e.g. in computer vision applications, are expected to provide significant amounts of computing power to process large data volumes. Many of these systems, such as used in medical diagnosis, are mobile devices and face significant challenges to provide sufficient performance while operating on a constrained energy budget.
Modern embedded MPSoC platforms use heterogeneous CPU and GPU cores providing a large number of optimization parameters. This allows to find useful trade-offs between energy consumption and performance for a given application. In this paper, we describe how the complex data processing required for PAMONO, a novel type of biosensor for the detection of biological viruses, can efficiently be implemented on a state-of-the-art heterogeneous MPSoC platform. An additional optimization dimension explored is the achieved quality of service. Reducing the virus detection accuracy enables additional optimizations not achievable by modifying hardware or software parameters alone.
Instead of relying on often inaccurate simulation models, our design space exploration employs a hardware-in-the-loop approach to evaluate the performance and energy consumption on the embedded target platform. Trade-offs between performance, energy and accuracy are controlled by a genetic algorithm running on a PC control system which deploys the evaluation tasks to a number of connected embedded boards. Using our optimization approach, we are able to achieve frame rates meeting the requirements without losing accuracy. Further, our approach is able to reduce the energy consumption by 93% with a still reasonable detection quality.
|
| Helena Kotthaus, Ingo Korb, Markus Künne and Peter Marwedel. Performance Analysis for R: Towards a Faster R Interpreter. In Abstract Booklet of the International R User Conference (UseR!), pages 104, Los Angeles, USA, July 2014 [BibTeX][Link]@inproceedings { kotthaus/2014b,
  author       = {Kotthaus, Helena and Korb, Ingo and K{\"u}nne, Markus and Marwedel, Peter},
  title        = {Performance Analysis for {R}: Towards a Faster {R} Interpreter},
  booktitle    = {Abstract Booklet of the International R User Conference (UseR!)},
  year         = {2014},
  pages        = {104},
  address      = {Los Angeles, USA},
  month        = jul,
  url          = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/kotthaus_user2014.pdf},
  confidential = {n},
} |
| Helena Kotthaus, Ingo Korb, Michael Engel and Peter Marwedel. Dynamic Page Sharing Optimization for the R Language. In Proceedings of the 10th Symposium on Dynamic Languages, pages 79--90, Portland, Oregon, USA, October 2014 [BibTeX][PDF][Link][Abstract]@inproceedings { kotthaus/2014e,
  author       = {Kotthaus, Helena and Korb, Ingo and Engel, Michael and Marwedel, Peter},
  title        = {Dynamic Page Sharing Optimization for the {R} Language},
  booktitle    = {Proceedings of the 10th Symposium on Dynamic Languages},
  year         = {2014},
  series       = {DLS '14},
  pages        = {79--90},
  address      = {Portland, Oregon, USA},
  month        = oct,
  publisher    = {ACM},
  url          = {http://dl.acm.org/citation.cfm?id=2661094},
  file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014e_kotthaus.pdf},
  confidential = {n},
  abstract     = {Dynamic languages such as R are increasingly used to process large data sets. Here, the R interpreter induces a large memory overhead due to wasteful memory allocation policies. If an application's working set exceeds the available physical memory, the OS starts to swap, resulting in slowdowns of several orders of magnitude. Thus, memory optimizations for R will be beneficial to many applications.
Existing R optimizations are mostly based on dynamic compilation or native libraries. Both methods are futile when the OS starts to page out memory. So far, only a few, data-type or application specific memory optimizations for R exist. To remedy this situation, we present a low-overhead page sharing approach for R that significantly reduces the interpreter's memory overhead. Concentrating on the most rewarding optimizations avoids the high runtime overhead of existing generic approaches for memory deduplication or compression. In addition, by applying knowledge of interpreter data structures and memory allocation patterns, our approach is not constrained to specific R applications and is transparent to the R interpreter.
Our page sharing optimization enables us to reduce the memory consumption by up to 53.5% with an average of 18.0% for a set of real-world R benchmarks with a runtime overhead of only 5.3% on average. In cases where page I/O can be avoided, significant speedups are achieved.},
}
Dynamic languages such as R are increasingly used to process large data sets. Here, the R interpreter induces a large memory overhead due to wasteful memory allocation policies. If an application's working set exceeds the available physical memory, the OS starts to swap, resulting in slowdowns of several orders of magnitude. Thus, memory optimizations for R will be beneficial to many applications.
Existing R optimizations are mostly based on dynamic compilation or native libraries. Both methods are futile when the OS starts to page out memory. So far, only a few, data-type or application specific memory optimizations for R exist. To remedy this situation, we present a low-overhead page sharing approach for R that significantly reduces the interpreter's memory overhead. Concentrating on the most rewarding optimizations avoids the high runtime overhead of existing generic approaches for memory deduplication or compression. In addition, by applying knowledge of interpreter data structures and memory allocation patterns, our approach is not constrained to specific R applications and is transparent to the R interpreter.
Our page sharing optimization enables us to reduce the memory consumption by up to 53.5% with an average of 18.0% for a set of real-world R benchmarks with a runtime overhead of only 5.3% on average. In cases where page I/O can be avoided, significant speedups are achieved.
|
| Chen-Wei Huang, Timon Kelter, Bjoern Boenninghoff, Jan Kleinsorge, Michael Engel, Peter Marwedel and Shiao-Li Tsao. Static WCET Analysis of the H.264/AVC Decoder Exploiting Coding Information. In International Conference on Embedded and Real-Time Computing Systems and Applications Chongqing, China, August 2014 [BibTeX]@inproceedings { huang:2014:rtcsa,
author = {Huang, Chen-Wei and Kelter, Timon and Boenninghoff, Bjoern and Kleinsorge, Jan and Engel, Michael and Marwedel, Peter and Tsao, Shiao-Li},
title = {Static WCET Analysis of the H.264/AVC Decoder Exploiting Coding Information},
booktitle = {International Conference on Embedded and Real-Time Computing Systems and Applications},
year = {2014},
address = {Chongqing, China},
month = aug,
organization = {IEEE},
keywords = {wcet},
confidential = {n},
} |
| Andreas Heinig, Florian Schmoll, Peter Marwedel and Michael Engel. Who's Using that Memory? A Subscriber Model for Mapping Errors to Tasks. In Proceedings of the 10th Workshop on Silicon Errors in Logic - System Effects (SELSE) Stanford, CA, USA, April 2014 [BibTeX][PDF][Abstract]@inproceedings { heinig:2014:SELSE,
author = {Heinig, Andreas and Schmoll, Florian and Marwedel, Peter and Engel, Michael},
title = {Who's Using that Memory? A Subscriber Model for Mapping Errors to Tasks},
booktitle = {Proceedings of the 10th Workshop on Silicon Errors in Logic - System Effects (SELSE)},
year = {2014},
address = {Stanford, CA, USA},
month = apr,
keywords = {ders},
internal-note = {review: this abstract is identical to the one of heinig:2013:sobres (JTAG fault injection) and does not match this paper's subscriber-model title -- verify against the published SELSE paper},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-heinig-selse2014.pdf},
confidential = {n},
abstract = {In order to assess the robustness of software-based fault-tolerance
methods, extensive tests have to be performed that
inject faults, such as bit flips, into hardware components of a running
system. Fault injection commonly uses either system simulations, resulting
in execution times orders of magnitude longer than on real systems, or
exposes a real system to error sources like radiation. This can take place
in real time, but it enables only a very coarse-grained control over the
affected system component.
A solution combining the best characteristics from both approaches should
achieve precise fault injection in real hardware systems. The approach
presented in this paper uses the JTAG background debug facility of a CPU
to inject faults into main memory and registers of a running system. Compared
to similar earlier approaches, our solution is able to achieve rapid
fault injection using a low-cost microcontroller instead of a complex
FPGA. Consequently, our injection software is much more flexible. It
allows to restrict error injection to the execution of a set of predefined
components, resulting in a more precise control of the injection, and
also emulates error reporting, which enables the evaluation
of different error detection approaches in addition to robustness
evaluation.
},
} In order to assess the robustness of software-based fault-tolerance
methods, extensive tests have to be performed that
inject faults, such as bit flips, into hardware components of a running
system. Fault injection commonly uses either system simulations, resulting
in execution times orders of magnitude longer than on real systems, or
exposes a real system to error sources like radiation. This can take place
in real time, but it enables only a very coarse-grained control over the
affected system component.
A solution combining the best characteristics from both approaches should
achieve precise fault injection in real hardware systems. The approach
presented in this paper uses the JTAG background debug facility of a CPU
to inject faults into main memory and registers of a running system. Compared
to similar earlier approaches, our solution is able to achieve rapid
fault injection using a low-cost microcontroller instead of a complex
FPGA. Consequently, our injection software is much more flexible. It
allows to restrict error injection to the execution of a set of predefined
components, resulting in a more precise control of the injection, and
also emulates error reporting, which enables the evaluation
of different error detection approaches in addition to robustness
evaluation.
|
| Timon Kelter and Peter Marwedel. Parallelism Analysis: Precise WCET Values for Complex Multi-Core Systems. In Third International Workshop on Formal Techniques for Safety-Critical Systems Luxembourg, November 2014 [BibTeX][PDF][Link]@inproceedings { kelter:2014:ftscs,
author = {Kelter, Timon and Marwedel, Peter},
title = {Parallelism Analysis: Precise WCET Values for Complex Multi-Core Systems},
booktitle = {Third International Workshop on Formal Techniques for Safety-Critical Systems},
year = {2014},
editor = {Artho, Cyrille and {\"O}lveczky, Peter},
series = {FTSCS},
address = {Luxembourg},
month = nov,
publisher = {Springer},
url = {http://www.ftscs.org/index.php?n=Main.Home},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-kelter-ftscs.pdf},
confidential = {n},
} |
| Timon Kelter, Peter Marwedel and Hendrik Borghorst. WCET-aware Scheduling Optimizations for Multi-Core Real-Time Systems. In International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS), pages 67-74 Samos, Greece, July 2014 [BibTeX][PDF]@inproceedings { kelter:2014:samos,
author = {Kelter, Timon and Marwedel, Peter and Borghorst, Hendrik},
title = {WCET-aware Scheduling Optimizations for Multi-Core Real-Time Systems},
booktitle = {International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS)},
year = {2014},
pages = {67--74},
address = {Samos, Greece},
month = jul,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-samos.pdf},
confidential = {n},
} |
| Bjoern Dusza, Peter Marwedel, Olaf Spinczyk and Christian Wietfeld. A Context-Aware Battery Lifetime Model for Carrier Aggregation Enabled LTE-A Systems. In IEEE Consumer Communications and Networking Conference Las Vegas, USA, January 2014 [BibTeX][Abstract]@inproceedings { marwedel:2014:ccnc,
author = {Dusza, Bjoern and Marwedel, Peter and Spinczyk, Olaf and Wietfeld, Christian},
title = {A Context-Aware Battery Lifetime Model for Carrier Aggregation Enabled LTE-A Systems},
booktitle = {IEEE Consumer Communications and Networking Conference},
year = {2014},
series = {CCNC},
address = {Las Vegas, USA},
month = jan,
organization = {IEEE},
keywords = {energy},
confidential = {n},
abstract = {A Quality of Experience (QoE) parameter of increasing importance is the time that a battery powered
communication device (e.g. smartphone) can be operated before it needs to be recharged. However, due to the fact that battery capacity is not evolving as fast as the power requirement, the battery lifetime of modern user equipment is
stagnating or even decreasing from one device generation to another. In parallel, a major challenge for the design of
next generation wireless systems such as LTE-Advanced (LTE-A) is that the required high portion of spectrum is not
available in a consecutive portion. For this reason, a procedure called interband non-continuous Carrier Aggregation
(CA) will be introduced in LTE-A which allows for the combination of multiple spectrum pieces from different frequency
bands. This procedure however requires the parallel operation of multiple power amplifiers that are characterized by a
high energy demand. In this paper, we quantify the impact of CA on the power consumption of LTE-A enabled communication by means of a Markovian based power consumption model that incorporates system parameters as well as context parameters. The results show that the suitability of CA does from a battery lifetime perspective strongly depend upon the actual device characteristics as well as the resource availability in the various frequency bands. Furthermore, the application of the sophisticated Kinetic Battery Model (KiBaM) shows that the charge recovery effect during idle periods does significantly affect the battery lifetime.},
} A Quality of Experience (QoE) parameter of increasing importance is the time that a battery powered
communication device (e.g. smartphone) can be operated before it needs to be recharged. However, due to the fact that battery capacity is not evolving as fast as the power requirement, the battery lifetime of modern user equipment is
stagnating or even decreasing from one device generation to another. In parallel, a major challenge for the design of
next generation wireless systems such as LTE-Advanced (LTE-A) is that the required high portion of spectrum is not
available in a consecutive portion. For this reason, a procedure called interband non-continuous Carrier Aggregation
(CA) will be introduced in LTE-A which allows for the combination of multiple spectrum pieces from different frequency
bands. This procedure however requires the parallel operation of multiple power amplifiers that are characterized by a
high energy demand. In this paper, we quantify the impact of CA on the power consumption of LTE-A enabled communication by means of a Markovian based power consumption model that incorporates system parameters as well as context parameters. The results show that the suitability of CA does from a battery lifetime perspective strongly depend upon the actual device characteristics as well as the resource availability in the various frequency bands. Furthermore, the application of the sophisticated Kinetic Battery Model (KiBaM) shows that the charge recovery effect during idle periods does significantly affect the battery lifetime.
|
| Peter Marwedel and Michael Engel. Flipped classroom teaching for a cyber-physical system course - an adequate presence-based learning approach in the internet age. In Proceedings of the Tenth European Workshop on Microelectronics Education (EWME) Tallinn, Estonia, May 2014 [BibTeX][PDF][Abstract]@inproceedings { marwedel:2014:ewme,
author = {Marwedel, Peter and Engel, Michael},
title = {Flipped classroom teaching for a cyber-physical system course - an adequate presence-based learning approach in the internet age},
booktitle = {Proceedings of the Tenth European Workshop on Microelectronics Education (EWME)},
year = {2014},
address = {Tallinn, Estonia},
month = may,
publisher = {IEEE},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-ewme.pdf},
confidential = {n},
abstract = {In the age of the Internet, teaching styles need to take new ways of learning into account. This paper recommends the use of the flipped classroom approach. In this approach, the roles of work at home and in class are essentially swapped. We present a case study covering a course on cyber-physical system fundamentals. Results are strongly encouraging us to continue along these lines. We are also commenting on general advantages and limitations of this style of teaching.},
} In the age of the Internet, teaching styles need to take new ways of learning into account. This paper recommends the use of the flipped classroom approach. In this approach, the roles of work at home and in class are essentially swapped. We present a case study covering a course on cyber-physical system fundamentals. Results are strongly encouraging us to continue along these lines. We are also commenting on general advantages and limitations of this style of teaching.
|
| Olaf Neugebauer, Michael Engel and Peter Marwedel. A Parallelization Approach for Resource Restricted Embedded Heterogeneous MPSoCs Inspired by OpenMP. In Proceedings of Software Engineering for Parallel Systems (SEPS) 2014 [BibTeX]@inproceedings { neugebauer:2014:seps,
author = {Neugebauer, Olaf and Engel, Michael and Marwedel, Peter},
title = {A Parallelization Approach for Resource Restricted Embedded Heterogeneous MPSoCs Inspired by OpenMP},
booktitle = {Proceedings of Software Engineering for Parallel Systems (SEPS)},
year = {2014},
confidential = {n},
} |
| Jan Kleinsorge and Peter Marwedel. Computing Maximum Blocking Times with Explicit Path Analysis under Non-local Flow Bounds. In Proceedings of the International Conference on Embedded Software (EMSOFT 2014) New Delhi, India, October 2014 [BibTeX][Link]@inproceedings { Kleinsorge:2014:EMSOFT,
author = {Kleinsorge, Jan and Marwedel, Peter},
title = {Computing Maximum Blocking Times with Explicit Path Analysis under Non-local Flow Bounds},
booktitle = {Proceedings of the International Conference on Embedded Software (EMSOFT 2014)},
year = {2014},
series = {EMSOFT 2014},
address = {New Delhi, India},
month = oct,
url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-jk.pdf},
confidential = {n},
} |
| Pascal Libuschewski, Dennis Kaulbars, Dominic Siedhoff, Frank Weichert, Heinrich Müller, Christian Wietfeld and Peter Marwedel. Multi-Objective Computation Offloading for Mobile Biosensors via LTE. In Wireless Mobile Communication and Healthcare (Mobihealth), 2014 EAI 4th International Conference on December 2014 [BibTeX][PDF][Link][Abstract]@inproceedings { Libuschewski/etal/2014a,
author = {Libuschewski, Pascal and Kaulbars, Dennis and Siedhoff, Dominic and Weichert, Frank and M{\"u}ller, Heinrich and Wietfeld, Christian and Marwedel, Peter},
title = {Multi-Objective Computation Offloading for Mobile Biosensors via LTE},
booktitle = {Wireless Mobile Communication and Healthcare (Mobihealth), 2014 EAI 4th International Conference on},
year = {2014},
month = dec,
url = {http://dx.doi.org/10.4108/icst.mobihealth.2014.257374},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-mobihealth.pdf},
confidential = {n},
abstract = {For a rapid identification of viral epidemics a mobile virus detection is needed, which can process samples without a laboratory. The application of medical biosensors, at key positions with a high passenger volume (e.g. airports), became increasingly meaningful as epidemic early warning systems. As mobile biosensors have to fulfill various demands, like a rapid analysis and prolonged battery lifetime we present in this study a multi-objective computation offloading for mobile sensors. The decision whether it is beneficial to offload work to a server can be made automatically on the basis of contrary objectives and several constraints.},
} For a rapid identification of viral epidemics a mobile virus detection is needed, which can process samples without a laboratory. The application of medical biosensors, at key positions with a high passenger volume (e.g. airports), became increasingly meaningful as epidemic early warning systems. As mobile biosensors have to fulfill various demands, like a rapid analysis and prolonged battery lifetime we present in this study a multi-objective computation offloading for mobile sensors. The decision whether it is beneficial to offload work to a server can be made automatically on the basis of contrary objectives and several constraints.
|
| Pascal Libuschewski, Peter Marwedel, Dominic Siedhoff and Heinrich Müller. Multi-Objective Energy-Aware GPGPU Design Space Exploration for Medical or Industrial Applications. In Signal-Image Technology and Internet-Based Systems (SITIS), 2014 Tenth International Conference on, pages 637-644 November 2014, doi 10.1109/SITIS.2014.11 [BibTeX][PDF][Link][Abstract]@inproceedings { Libuschewski/etal/2014b,
author = {Libuschewski, Pascal and Marwedel, Peter and Siedhoff, Dominic and M{\"u}ller, Heinrich},
title = {Multi-Objective Energy-Aware GPGPU Design Space Exploration for Medical or Industrial Applications},
booktitle = {Signal-Image Technology and Internet-Based Systems (SITIS), 2014 Tenth International Conference on},
year = {2014},
pages = {637--644},
month = nov,
publisher = {IEEE Computer Society},
doi = {10.1109/SITIS.2014.11},
note = {doi 10.1109/SITIS.2014.11},
url = {https://dx.doi.org/10.1109/SITIS.2014.11},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2014-sitis.pdf},
confidential = {n},
abstract = {This work presents a multi-objective design space exploration for Graphics Processing Units (GPUs). For any given GPGPU application, a Pareto front of best suited GPUs can be calculated. The objectives can be chosen according to the demands of the system, for example energy efficiency, run time and real-time capability. The simulated GPUs can be desktop, high performance or mobile versions. Also GPUs that do not yet exist can be modeled and simulated.
The main application area for the presented approach is the identification of suitable GPU hardware for given medical or industrial applications, e.g. for real-time process control or in healthcare sensor environments. As use case a real-time capable medical biosensor program for an automatic detection of pathogens and a wide variety of industrial, biological and physical applications were evaluated.},
} This work presents a multi-objective design space exploration for Graphics Processing Units (GPUs). For any given GPGPU application, a Pareto front of best suited GPUs can be calculated. The objectives can be chosen according to the demands of the system, for example energy efficiency, run time and real-time capability. The simulated GPUs can be desktop, high performance or mobile versions. Also GPUs that do not yet exist can be modeled and simulated.
The main application area for the presented approach is the identification of suitable GPU hardware for given medical or industrial applications, e.g. for real-time process control or in healthcare sensor environments. As use case a real-time capable medical biosensor program for an automatic detection of pathogens and a wide variety of industrial, biological and physical applications were evaluated.
|
| Helena Kotthaus, Michel Lang, Jörg Rahnenführer and Peter Marwedel. Runtime and Memory Consumption Analyses for Machine Learning R Programs. In Abstracts 45. Arbeitstagung, Ulmer Informatik-Berichte, pages 3-4 June 2013 [BibTeX][PDF]@inproceedings { kotthaus/2013a,
author = {Kotthaus, Helena and Lang, Michel and Rahnenf{\"u}hrer, J{\"o}rg and Marwedel, Peter},
title = {Runtime and Memory Consumption Analyses for Machine Learning R Programs},
booktitle = {Abstracts 45. Arbeitstagung, Ulmer Informatik-Berichte},
year = {2013},
pages = {3--4},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/kotthaus_etal_2013a.pdf},
confidential = {n},
} |
| Daniel Cordes, Michael Engel, Olaf Neugebauer and Peter Marwedel. Automatic Extraction of Task-Level Parallelism for Heterogeneous MPSoCs. In Proceedings of the Fourth International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI 2013) Lyon, France, October 2013 [BibTeX][PDF][Abstract]@inproceedings { Cordes:2013:PSTI,
author = {Cordes, Daniel and Engel, Michael and Neugebauer, Olaf and Marwedel, Peter},
title = {Automatic Extraction of Task-Level Parallelism for Heterogeneous MPSoCs},
booktitle = {Proceedings of the Fourth International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI 2013)},
year = {2013},
series = {PSTI 2013},
address = {Lyon, France},
month = oct,
keywords = {automatic parallelization; embedded software; heterogeneity; mpsoc; integer linear programming; task-level parallelism},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-psti-cordes.pdf},
confidential = {n},
abstract = {Heterogeneous multi-core platforms are increasingly attractive for embedded applications due to their adaptability and efficiency. This proliferation of heterogeneity demands new approaches for extracting thread level parallelism from sequential applications which have to be efficient at runtime. We present, to the best of our knowledge, the first Integer Linear Programming (ILP)-based parallelization approach for heterogeneous multi-core platforms. Using Hierarchical Task Graphs and high-level timing models, our approach manages to balance the extracted tasks while considering performance differences between cores. As a result, we obtain considerable speedups at runtime, significantly outperforming tools for homogeneous systems. We evaluate our approach by parallelizing standard benchmarks from various application domains.},
} Heterogeneous multi-core platforms are increasingly attractive for embedded applications due to their adaptability and efficiency. This proliferation of heterogeneity demands new approaches for extracting thread level parallelism from sequential applications which have to be efficient at runtime. We present, to the best of our knowledge, the first Integer Linear Programming (ILP)-based parallelization approach for heterogeneous multi-core platforms. Using Hierarchical Task Graphs and high-level timing models, our approach manages to balance the extracted tasks while considering performance differences between cores. As a result, we obtain considerable speedups at runtime, significantly outperforming tools for homogeneous systems. We evaluate our approach by parallelizing standard benchmarks from various application domains.
|
| Timon Kelter, Tim Harde, Peter Marwedel and Heiko Falk. Evaluation of resource arbitration methods for multi-core real-time systems. In Proceedings of the 13th International Workshop on Worst-Case Execution Time Analysis (WCET) Paris, France, July 2013 [BibTeX][PDF][Link][Abstract]@inproceedings { kelter:2013:wcet,
author = {Kelter, Timon and Harde, Tim and Marwedel, Peter and Falk, Heiko},
title = {Evaluation of resource arbitration methods for multi-core real-time systems},
booktitle = {Proceedings of the 13th International Workshop on Worst-Case Execution Time Analysis (WCET)},
year = {2013},
editor = {Maiza, Claire},
address = {Paris, France},
month = jul,
url = {http://wcet2013.imag.fr/},
keywords = {wcet},
file = {http://drops.dagstuhl.de/opus/volltexte/2013/4117/pdf/2.pdf},
confidential = {n},
abstract = {Multi-core systems have become prevalent in the last years, because of their favorable properties in terms of energy consumption, computing power and design complexity. First attempts have been made to devise WCET analyses for multi-core processors, which have to deal with the problem that the cores may experience interferences during accesses to shared resources. To limit these interferences, the vast amount of previous work is proposing a strict TDMA (time division multiple access) schedule for arbitrating shared resources. Though this type of arbitration yields a high predictability, this advantage is paid for with a poor resource utilization. In this work, we compare different arbitration methods with respect to their predictability and average case performance. We show how known WCET analysis techniques can be extended to work with the presented arbitration strategies and perform an evaluation of the resulting ACETs and WCETs on an extensive set of realworld benchmarks. Results show that there are cases when TDMA is not the best strategy, especially when predictability and performance are equally important.},
} Multi-core systems have become prevalent in the last years, because of their favorable properties in terms of energy consumption, computing power and design complexity. First attempts have been made to devise WCET analyses for multi-core processors, which have to deal with the problem that the cores may experience interferences during accesses to shared resources. To limit these interferences, the vast amount of previous work is proposing a strict TDMA (time division multiple access) schedule for arbitrating shared resources. Though this type of arbitration yields a high predictability, this advantage is paid for with a poor resource utilization. In this work, we compare different arbitration methods with respect to their predictability and average case performance. We show how known WCET analysis techniques can be extended to work with the presented arbitration strategies and perform an evaluation of the resulting ACETs and WCETs on an extensive set of realworld benchmarks. Results show that there are cases when TDMA is not the best strategy, especially when predictability and performance are equally important.
|
| Daniel Cordes, Michael Engel, Olaf Neugebauer and Peter Marwedel. Automatic Extraction of Pipeline Parallelism for Embedded Heterogeneous Multi-Core Platforms. In Proceedings of the Sixteenth International Conference on Compilers, Architectures, and Synthesis for Embedded Systems (CASES 2013) Montreal, Canada, October 2013 [BibTeX][PDF][Abstract]@inproceedings { Cordes:2013:CASES,
author = {Cordes, Daniel and Engel, Michael and Neugebauer, Olaf and Marwedel, Peter},
title = {Automatic Extraction of Pipeline Parallelism for Embedded Heterogeneous Multi-Core Platforms},
booktitle = {Proceedings of the Sixteenth International Conference on Compilers, Architectures, and Synthesis for Embedded Systems (CASES 2013)},
year = {2013},
series = {CASES 2013},
address = {Montreal, Canada},
month = oct,
keywords = {Automatic Parallelization; Heterogeneity; MPSoC; Embedded Software; Integer Linear Programming; Pipeline},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-cases-cordes.pdf},
confidential = {n},
abstract = {Automatic parallelization of sequential applications is the key for efficient use and optimization of current and future embedded multi-core systems. However, existing approaches often fail to achieve efficient balancing of tasks running on heterogeneous cores of an MPSoC. A reason for this is often insufficient knowledge of the underlying architecture's performance.
In this paper, we present a novel parallelization approach for embedded MPSoCs that combines pipeline parallelization for loops with knowledge about different execution times for tasks on cores with different performance properties. Using Integer Linear Programming, an optimal solution with respect to the model used is derived implementing tasks with a well-balanced execution behavior. We evaluate our pipeline parallelization approach for heterogeneous MPSoCs using a set of standard embedded benchmarks and compare it with two existing state-of-the-art approaches. For all benchmarks, our parallelization approach obtains significantly higher speedups than either approach on heterogeneous MPSoCs.
},
} Automatic parallelization of sequential applications is the key for efficient use and optimization of current and future embedded multi-core systems. However, existing approaches often fail to achieve efficient balancing of tasks running on heterogeneous cores of an MPSoC. A reason for this is often insufficient knowledge of the underlying architecture's performance.
In this paper, we present a novel parallelization approach for embedded MPSoCs that combines pipeline parallelization for loops with knowledge about different execution times for tasks on cores with different performance properties. Using Integer Linear Programming, an optimal solution with respect to the model used is derived implementing tasks with a well-balanced execution behavior. We evaluate our pipeline parallelization approach for heterogeneous MPSoCs using a set of standard embedded benchmarks and compare it with two existing state-of-the-art approaches. For all benchmarks, our parallelization approach obtains significantly higher speedups than either approach on heterogeneous MPSoCs.
|
| Andreas Heinig, Ingo Korb, Florian Schmoll, Peter Marwedel and Michael Engel. Fast and Low-Cost Instruction-Aware Fault Injection. In Proc. of SOBRES 2013 2013 [BibTeX][Link][Abstract]@inproceedings { heinig:2013:sobres,
author = {Heinig, Andreas and Korb, Ingo and Schmoll, Florian and Marwedel, Peter and Engel, Michael},
title = {Fast and Low-Cost Instruction-Aware Fault Injection},
booktitle = {Proc. of SOBRES 2013},
year = {2013},
url = {http://danceos.org/sobres/2013/papers/SOBRES-640-Heinig.pdf},
keywords = {ders},
confidential = {n},
abstract = {In order to assess the robustness of software-based fault-tolerance methods, extensive tests have to be performed that inject faults, such as bit flips, into hardware components of a running system. Fault injection commonly uses either system simulations, resulting in execution times orders of magnitude longer than on real systems, or exposes a real system to error sources like radiation. This can take place in real time, but it enables only a very coarse-grained control over the affected system component.
A solution combining the best characteristics from both approaches should achieve precise fault injection in real hardware systems. The approach presented in this paper uses the JTAG background debug facility of a CPU to inject faults into main memory and registers of a running system. Compared to similar earlier approaches, our solution is able to achieve rapid fault injection using a low-cost microcontroller instead of a complex FPGA. Consequently, our injection software is much more flexible. It allows to restrict error injection to the execution of a set of predefined components, resulting in a more precise control of the injection, and also emulates error reporting, which enables the evaluation of different error detection approaches in addition to robustness evaluation.},
} In order to assess the robustness of software-based fault-tolerance methods, extensive tests have to be performed that inject faults, such as bit flips, into hardware components of a running system. Fault injection commonly uses either system simulations, resulting in execution times orders of magnitude longer than on real systems, or exposes a real system to error sources like radiation. This can take place in real time, but it enables only a very coarse-grained control over the affected system component.
A solution combining the best characteristics from both approaches should achieve precise fault injection in real hardware systems. The approach presented in this paper uses the JTAG background debug facility of a CPU to inject faults into main memory and registers of a running system. Compared to similar earlier approaches, our solution is able to achieve rapid fault injection using a low-cost microcontroller instead of a complex FPGA. Consequently, our injection software is much more flexible. It allows to restrict error injection to the execution of a set of predefined components, resulting in a more precise control of the injection, and also emulates error reporting, which enables the evaluation of different error detection approaches in addition to robustness evaluation.
|
| Daniel Cordes, Michael Engel, Olaf Neugebauer and Peter Marwedel. Automatic Extraction of Multi-Objective Aware Parallelism for Heterogeneous MPSoCs. In Proceedings of the Sixth International Workshop on Multi-/Many-core Computing Systems (MuCoCoS 2013) Edinburgh, Scotland, UK, September 2013 [BibTeX][PDF][Abstract]@inproceedings { Cordes:2013:MUCOCOS,
author = {Cordes, Daniel and Engel, Michael and Neugebauer, Olaf and Marwedel, Peter},
title = {Automatic Extraction of Multi-Objective Aware Parallelism for Heterogeneous MPSoCs},
booktitle = {Proceedings of the Sixth International Workshop on Multi-/Many-core Computing Systems (MuCoCoS 2013)},
year = {2013},
series = {MuCoCoS 2013},
address = {Edinburgh, Scotland, UK},
month = sep,
keywords = {automatic parallelization; embedded software; heterogeneity; mpsoc; genetic algorithms; task-level parallelism; pipeline parallelism; multi-objective},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013-mucocos-cordes.pdf},
confidential = {n},
abstract = {Heterogeneous MPSoCs are used in a large fraction of current embedded systems. In order to efficiently exploit the available processing power, advanced parallelization techniques are required. In addition to consider performance variances between heterogeneous cores, these methods have to be multi-objective aware to be useful for resource restricted embedded systems. This multi-objective optimization requirement results in an explosion of the design space size. As a consequence, efficient approaches are required to find promising solution candidates. In this paper, we present the first portable genetic algorithm-based approach to speed up ANSI-C applications by combining extraction techniques for task-level and pipeline parallelism for heterogeneous multicores while considering additional objectives.
Using our approach enables embedded system designers to select a parallelization of an application from a set of Pareto-optimal solutions according to the performance and energy consumption requirements of a given system. The evaluation of a large set of typical embedded benchmarks shows that our approach is able to generate solutions with low energy consumption, high speedup, low communication overhead or useful trade-offs between these three objectives.},
} Heterogeneous MPSoCs are used in a large fraction of current embedded systems. In order to efficiently exploit the available processing power, advanced parallelization techniques are required. In addition to consider performance variances between heterogeneous cores, these methods have to be multi-objective aware to be useful for resource restricted embedded systems. This multi-objective optimization requirement results in an explosion of the design space size. As a consequence, efficient approaches are required to find promising solution candidates. In this paper, we present the first portable genetic algorithm-based approach to speed up ANSI-C applications by combining extraction techniques for task-level and pipeline parallelism for heterogeneous multicores while considering additional objectives.
Using our approach enables embedded system designers to select a parallelization of an application from a set of Pareto-optimal solutions according to the performance and energy consumption requirements of a given system. The evaluation of a large set of typical embedded benchmarks shows that our approach is able to generate solutions with low energy consumption, high speedup, low communication overhead or useful trade-offs between these three objectives.
|
| Jan Kleinsorge, Heiko Falk and Peter Marwedel. Simple Analysis of Partial Worst-case Execution Paths on General Control Flow Graphs. In Proceedings of the International Conference on Embedded Software (EMSOFT 2013) Montreal, Canada, October 2013 [BibTeX][Link]@inproceedings { Kleinsorge:2013:EMSOFT,
author = {Kleinsorge, Jan and Falk, Heiko and Marwedel, Peter},
title = {Simple Analysis of Partial Worst-case Execution Paths on General Control Flow Graphs},
booktitle = {Proceedings of the International Conference on Embedded Software (EMSOFT 2013)},
year = {2013},
series = {EMSOFT 2013},
address = {Montreal, Canada},
month = oct,
url = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2013_emsoft.pdf},
keywords = {wcet; Worst-case Execution Time; Path Analysis; Static Analysis},
confidential = {n},
} |
| Daniel Cordes and Peter Marwedel. Multi-Objective Aware Extraction of Task-Level Parallelism Using Genetic Algorithms. In Proceedings of Design, Automation and Test in Europe (DATE 2012) Dresden, Germany, March 2012 [BibTeX][PDF][Abstract]@inproceedings { cordes:12:date,
author = {Cordes, Daniel and Marwedel, Peter},
title = {Multi-Objective Aware Extraction of Task-Level Parallelism Using Genetic Algorithms},
booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2012)},
year = {2012},
address = {Dresden, Germany},
month = mar,
keywords = {Automatic Parallelization, Embedded Software, Multi-Objective, Genetic Algorithms, Task-Level Parallelism, Energy awareness},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-date-cordes.pdf},
confidential = {n},
abstract = {A large amount of research work has been done in the area of automatic parallelization for decades, resulting in a huge amount of tools, which should relieve the designer from the burden of manually parallelizing an application. Unfortunately, most of these tools are only optimizing the execution time by splitting up applications into concurrently executed tasks. In the domain of embedded devices, however, it is not sufficient to look only at this criterion. Since most of these devices are constraint-driven regarding execution time, energy consumption, heat dissipation and other objectives, a good trade-off has to be found to efficiently map applications to multiprocessor system on chip (MPSoC) devices. Therefore, we developed a fully automated multi-objective aware parallelization framework, which optimizes different objectives at the same time. The tool returns a Pareto-optimal front of solutions of the parallelized application to the designer, so that the solution with the best trade-off can be chosen.},
} A large amount of research work has been done in the area of automatic parallelization for decades, resulting in a huge amount of tools, which should relieve the designer from the burden of manually parallelizing an application. Unfortunately, most of these tools are only optimizing the execution time by splitting up applications into concurrently executed tasks. In the domain of embedded devices, however, it is not sufficient to look only at this criterion. Since most of these devices are constraint-driven regarding execution time, energy consumption, heat dissipation and other objectives, a good trade-off has to be found to efficiently map applications to multiprocessor system on chip (MPSoC) devices. Therefore, we developed a fully automated multi-objective aware parallelization framework, which optimizes different objectives at the same time. The tool returns a Pareto-optimal front of solutions of the parallelized application to the designer, so that the solution with the best trade-off can be chosen.
|
| Michael Engel and Peter Marwedel. Semantic Gaps in Software-Based Reliability. In Proceedings of the 4th Workshop on Design for Reliability (DFR'12) Paris, France, January 2012 [BibTeX][Abstract]@inproceedings { engel:dfr:2012,
author = {Engel, Michael and Marwedel, Peter},
title = {Semantic Gaps in Software-Based Reliability},
booktitle = {Proceedings of the 4th Workshop on Design for Reliability (DFR'12)},
year = {2012},
address = {Paris, France},
month = jan,
organization = {HiPEAC},
keywords = {ders},
confidential = {n},
abstract = {Future semiconductors will show a heterogeneous distribution of permanent faults as a result of fabrication variations and aging. To increase yields and lifetimes of these chips, a fault tolerance approach is required that handles resources on a small-scale basis with low overhead. In embedded systems, this overhead can be reduced by classifying data and instructions to determine the varying impact of errors on different instructions and data. Using this classification, only errors with significant impact on system behavior have to be corrected.
In this position paper, we describe one problem with this analysis, the semantic gap between high-level language source code and the low-level data flow through architecture components. In addition, we discuss possible approaches to handle this gap. Of special interest are the implications on achieving reliable execution of dependability-critical code.
},
} Future semiconductors will show a heterogeneous distribution of permanent faults as a result of fabrication variations and aging. To increase yields and lifetimes of these chips, a fault tolerance approach is required that handles resources on a small-scale basis with low overhead. In embedded systems, this overhead can be reduced by classifying data and instructions to determine the varying impact of errors on different instructions and data. Using this classification, only errors with significant impact on system behavior have to be corrected.
In this position paper, we describe one problem with this analysis, the semantic gap between high-level language source code and the low-level data flow through architecture components. In addition, we discuss possible approaches to handle this gap. Of special interest are the implications on achieving reliable execution of dependability-critical code.
|
| Olivera Jovanovic, Nils Kneuper, Peter Marwedel and Michael Engel. ILP-based Memory-Aware Mapping Optimization for MPSoCs. In The 10th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing Paphos, Cyprus, December 2012 [BibTeX][PDF][Abstract]@inproceedings { jovanovic:2012b,
author = {Jovanovic, Olivera and Kneuper, Nils and Marwedel, Peter and Engel, Michael},
title = {ILP-based Memory-Aware Mapping Optimization for MPSoCs},
booktitle = {The 10th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing},
year = {2012},
address = {Paphos, Cyprus},
month = dec,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-cse-jovanovic.pdf},
confidential = {n},
abstract = {The mapping of applications onto multiprocessor system-on-chip (MPSoC) devices is an important and complex optimization task. The goal is to efficiently distribute application tasks to available processors while optimizing for energy or runtime. Unfortunately, the influence of memories or memory hierarchies is not considered in existing mapping optimizations so far, even though it is a well-known fact that memories have a drastic impact on runtime and energy consumption of the system.
In this paper, we address the challenge of finding an efficient application to MPSoC mapping while explicitly considering the
underlying memory subsystem and an efficient mapping of task’s memory objects to memories. For this purpose, we developed a memory-aware mapping tool based on ILP optimization. Evaluations on various benchmarks show that our memory-aware mapping tool outperforms state-of-the-art mapping optimizations by reducing the runtime up to 18%, and energy consumption up to 21%.},
} The mapping of applications onto multiprocessor system-on-chip (MPSoC) devices is an important and complex optimization task. The goal is to efficiently distribute application tasks to available processors while optimizing for energy or runtime. Unfortunately, the influence of memories or memory hierarchies is not considered in existing mapping optimizations so far, even though it is a well-known fact that memories have a drastic impact on runtime and energy consumption of the system.
In this paper, we address the challenge of finding an efficient application to MPSoC mapping while explicitly considering the
underlying memory subsystem and an efficient mapping of task’s memory objects to memories. For this purpose, we developed a memory-aware mapping tool based on ILP optimization. Evaluations on various benchmarks show that our memory-aware mapping tool outperforms state-of-the-art mapping optimizations by reducing the runtime up to 18%, and energy consumption up to 21%.
|
| Sascha Plazar, Jan Kleinsorge, Heiko Falk and Peter Marwedel. WCET-aware Static Locking of Instruction Caches. In Proceedings of the International Symposium on Code Generation and Optimization (CGO), pages 44-52 San Jose, CA, USA, April 2012 [BibTeX][Link][Abstract]@inproceedings { plazar:2012:cgo,
author = {Plazar, Sascha and Kleinsorge, Jan and Falk, Heiko and Marwedel, Peter},
title = {WCET-aware Static Locking of Instruction Caches},
booktitle = {Proceedings of the International Symposium on Code Generation and Optimization (CGO)},
year = {2012},
pages = {44--52},
address = {San Jose, CA, USA},
month = apr,
url = {http://www.uni-ulm.de/fileadmin/website_uni_ulm/iui.inst.050/profile/profil_hfalk/publications/20120402-cgo-plazar.pdf},
keywords = {wcet},
confidential = {n},
abstract = {In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches. This step was necessary in order to bridge the increasingly growing gap between processor and memory system performance. Static analysis techniques had to be developed to allow the estimation of the cache behavior and an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems.
In this paper, we propose a WCET-aware optimization technique for static I-cache locking which improves a program’s performance and predictability. To select the memory blocks to lock into the cache and avoid time consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP). The ILP models the worst-case execution path (WCEP) of a program and takes the influence of locked cache contents into account. By modeling the effect of locked memory blocks on the runtime of basic blocks, the overall WCET of a program can be minimized. We show that our optimization is able to reduce the WCET of real-life benchmarks by up to 40.8%. At the same time, our proposed approach is able to outperform a regular cache by up to 23.8% in terms of WCET.},
} In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches. This step was necessary in order to bridge the increasingly growing gap between processor and memory system performance. Static analysis techniques had to be developed to allow the estimation of the cache behavior and an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems.
In this paper, we propose a WCET-aware optimization technique for static I-cache locking which improves a program’s performance and predictability. To select the memory blocks to lock into the cache and avoid time consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP). The ILP models the worst-case execution path (WCEP) of a program and takes the influence of locked cache contents into account. By modeling the effect of locked memory blocks on the runtime of basic blocks, the overall WCET of a program can be minimized. We show that our optimization is able to reduce the WCET of real-life benchmarks by up to 40.8%. At the same time, our proposed approach is able to outperform a regular cache by up to 23.8% in terms of WCET.
|
| Andreas Heinig, Vincent J. Mooney, Florian Schmoll, Peter Marwedel, Krishna Palem and Michael Engel. Classification-based Improvement of Application Robustness and Quality of Service in Probabilistic Computer Systems. In Proceedings of ARCS 2012 - International Conference on Architecture of Computing Systems Munich, Germany, March 2012, -- ARCS 2012 Best Paper Award Winner -- [BibTeX][PDF][Abstract]@inproceedings { heinig:2012:arcs,
author = {Heinig, Andreas and Mooney, Vincent J. and Schmoll, Florian and Marwedel, Peter and Palem, Krishna and Engel, Michael},
title = {Classification-based Improvement of Application Robustness and Quality of Service in Probabilistic Computer Systems},
booktitle = {Proceedings of ARCS 2012 - International Conference on Architecture of Computing Systems},
year = {2012},
address = {Munich, Germany},
month = mar,
note = {-- ARCS 2012 Best Paper Award Winner --},
keywords = {ders},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-arcs-heinig.pdf},
confidential = {n},
abstract = {Future semiconductors no longer guarantee permanent deterministic operation. They are expected to show probabilistic behavior due to lowered voltages and shrinking structures. Compared to radiation-induced errors, probabilistic systems face increased error frequencies leading to unexpected bit-flips. Approaches like probabilistic CMOS provide methods to control error distributions which reduce the error probability in more significant bits. However, instructions handling control flow or pointers still require determinism, requiring a classification to identify these instructions.
We apply our transient error classification to probabilistic circuits using differing voltage distributions. Static analysis ensures that probabilistic effects only affect unreliable operations which accept a certain level of impreciseness, and that errors in probabilistic components will never propagate to critical operations.
To evaluate, we analyze robustness and quality-of-service of an H.264 video decoder. Using classification results, we map unreliable arithmetic operations onto probabilistic components of an MPARM model, while remaining operations use deterministic components.},
} Future semiconductors no longer guarantee permanent deterministic operation. They are expected to show probabilistic behavior due to lowered voltages and shrinking structures. Compared to radiation-induced errors, probabilistic systems face increased error frequencies leading to unexpected bit-flips. Approaches like probabilistic CMOS provide methods to control error distributions which reduce the error probability in more significant bits. However, instructions handling control flow or pointers still require determinism, requiring a classification to identify these instructions.
We apply our transient error classification to probabilistic circuits using differing voltage distributions. Static analysis ensures that probabilistic effects only affect unreliable operations which accept a certain level of impreciseness, and that errors in probabilistic components will never propagate to critical operations.
To evaluate, we analyze robustness and quality-of-service of an H.264 video decoder. Using classification results, we map unreliable arithmetic operations onto probabilistic components of an MPARM model, while remaining operations use deterministic components.
|
| Sudipta Chattopadhyay, Chong Lee Kee, Abhik Roychoudhury, Timon Kelter, Heiko Falk and Peter Marwedel. A Unified WCET Analysis Framework for Multi-core Platforms. In IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS), pages 99-108 Beijing, China, April 2012 [BibTeX][PDF][Link][Abstract]@inproceedings { kelter:2012:rtas,
author = {Chattopadhyay, Sudipta and Kee, Chong Lee and Roychoudhury, Abhik and Kelter, Timon and Falk, Heiko and Marwedel, Peter},
title = {A Unified WCET Analysis Framework for Multi-core Platforms},
booktitle = {IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS)},
year = {2012},
pages = {99--108},
address = {Beijing, China},
month = apr,
url = {http://www.rtas.org/12-home.htm},
keywords = {wcet},
file = {http://www.comp.nus.edu.sg/~sudiptac/papers/mxtiming.pdf},
confidential = {n},
abstract = {With the advent of multi-core architectures, worst case execution time (WCET) analysis has become an increasingly difficult problem. In this paper, we propose a unified WCET analysis framework for multi-core processors featuring both shared cache and shared bus. Compared to other previous works, our work differs by modeling the interaction of shared cache and shared bus with other basic micro-architectural components (e.g. pipeline and branch predictor). In addition, our framework does not assume a timing anomaly free multicore architecture for computing the WCET. A detailed experiment methodology suggests that we can obtain reasonably tight WCET estimates in a wide range of benchmark programs.},
} With the advent of multi-core architectures, worst case execution time (WCET) analysis has become an increasingly difficult problem. In this paper, we propose a unified WCET analysis framework for multi-core processors featuring both shared cache and shared bus. Compared to other previous works, our work differs by modeling the interaction of shared cache and shared bus with other basic micro-architectural components (e.g. pipeline and branch predictor). In addition, our framework does not assume a timing anomaly free multicore architecture for computing the WCET. A detailed experiment methodology suggests that we can obtain reasonably tight WCET estimates in a wide range of benchmark programs.
|
| Helena Kotthaus, Sascha Plazar and Peter Marwedel. A JVM-based Compiler Strategy for the R Language. In Abstract Booklet at The 8th International R User Conference (UseR!) WiP, pages 68 Nashville, Tennessee, USA, June 2012 [BibTeX][PDF]@inproceedings { kotthaus:12:user,
author = {Kotthaus, Helena and Plazar, Sascha and Marwedel, Peter},
title = {A JVM-based Compiler Strategy for the R Language},
booktitle = {Abstract Booklet at The 8th International R User Conference (UseR!) WiP},
year = {2012},
pages = {68},
address = {Nashville, Tennessee, USA},
month = jun,
keywords = {R language, Java, dynamic compiler optimization},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-user-kotthaus.pdf},
confidential = {n},
} |
| Daniel Cordes, Michael Engel, Peter Marwedel and Olaf Neugebauer. Automatic extraction of multi-objective aware pipeline parallelism using genetic algorithms. In Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis Tampere, Finland, October 2012 [BibTeX][PDF][Abstract]@inproceedings { Cordes:2012:CODES,
author = {Cordes, Daniel and Engel, Michael and Marwedel, Peter and Neugebauer, Olaf},
title = {Automatic extraction of multi-objective aware pipeline parallelism using genetic algorithms},
booktitle = {Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis},
year = {2012},
series = {CODES+ISSS '12},
address = {Tampere, Finland},
month = oct,
publisher = {ACM},
keywords = {automatic parallelization, embedded software, energy, genetic algorithms, multi-objective, pipeline parallelism},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-codes-cordes.pdf},
confidential = {n},
abstract = {The development of automatic parallelization techniques has been fascinating researchers for decades. This has resulted in a significant amount of tools, which should relieve the designer from the burden of manually parallelizing an application. However, most of these tools only focus on minimizing execution time which drastically reduces their applicability to embedded devices. It is essential to find good trade-offs between different objectives like, e.g., execution time, energy consumption, or communication overhead, if applications should be parallelized for embedded multiprocessor system-on-chip (MPSoC) devices. Another important aspect which has to be taken into account is the streaming-based structure found in many embedded applications such as multimedia and network services. The best way to parallelize these applications is to extract pipeline parallelism. Therefore, this paper presents the first multi-objective aware approach exploiting pipeline parallelism automatically to make it most suitable for resource-restricted embedded devices. We have compared the new pipeline parallelization approach to an existing task-level extraction technique. The evaluation has shown that the new approach extracts very efficient multi-objective aware parallelism. In addition, the two approaches have been combined and it could be shown that both approaches perfectly complement each other.},
} The development of automatic parallelization techniques has been fascinating researchers for decades. This has resulted in a significant amount of tools, which should relieve the designer from the burden of manually parallelizing an application. However, most of these tools only focus on minimizing execution time which drastically reduces their applicability to embedded devices. It is essential to find good trade-offs between different objectives like, e.g., execution time, energy consumption, or communication overhead, if applications should be parallelized for embedded multiprocessor system-on-chip (MPSoC) devices. Another important aspect which has to be taken into account is the streaming-based structure found in many embedded applications such as multimedia and network services. The best way to parallelize these applications is to extract pipeline parallelism. Therefore, this paper presents the first multi-objective aware approach exploiting pipeline parallelism automatically to make it most suitable for resource-restricted embedded devices. We have compared the new pipeline parallelization approach to an existing task-level extraction technique. The evaluation has shown that the new approach extracts very efficient multi-objective aware parallelism. In addition, the two approaches have been combined and it could be shown that both approaches perfectly complement each other.
|
| Peter Marwedel and Michael Engel. Efficient Computing in Cyber-Physical Systems. In Proceedings of SAMOS XII July 2012 [BibTeX][PDF][Abstract]@inproceedings { marwedel:2012:samos,
author = {Marwedel, Peter and Engel, Michael},
title = {Efficient Computing in Cyber-Physical Systems},
booktitle = {Proceedings of SAMOS XII},
year = {2012},
month = jul,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2012-samos-marwedel.pdf},
confidential = {n},
abstract = {Computing in cyber-physical systems has to be efficient in terms of a number of objectives. In particular, computing has to be execution-time and energy efficient. In this paper, we will consider optimization techniques aiming at efficiency in terms of these two objectives. In the first part, we will consider techniques for the integration of compilers and worst-case execution time (WCET) estimation. We will demonstrate, how such integration opens the door to WCET-reduction algorithms. For example, an algorithm for WCET-aware compilation reduces the WCET for an automotive application by more than 50\% by exploiting scratch pad memories (SPMs).
In the second part, we will demonstrate techniques for improving the energy efficiency of cyber-physical systems, in particular the use of SPMs. In the third part, we demonstrate how the optimization for multiple objectives taken into account. This paper provides an overview of work performed at the Chair for Embedded Systems of TU Dortmund and the Informatik Centrum Dortmund, Germany.},
} Computing in cyber-physical systems has to be efficient in terms of a number of objectives. In particular, computing has to be execution-time and energy efficient. In this paper, we will consider optimization techniques aiming at efficiency in terms of these two objectives. In the first part, we will consider techniques for the integration of compilers and worst-case execution time (WCET) estimation. We will demonstrate, how such integration opens the door to WCET-reduction algorithms. For example, an algorithm for WCET-aware compilation reduces the WCET for an automotive application by more than 50% by exploiting scratch pad memories (SPMs).
In the second part, we will demonstrate techniques for improving the energy efficiency of cyber-physical systems, in particular the use of SPMs. In the third part, we demonstrate how the optimization for multiple objectives taken into account. This paper provides an overview of work performed at the Chair for Embedded Systems of TU Dortmund and the Informatik Centrum Dortmund, Germany.
|
| Olivera Jovanovic, Peter Marwedel, Iuliana Bacivarov and Lothar Thiele. MAMOT: Memory-Aware Mapping Optimization Tool for MPSoC. In 15th Euromicro Conference on Digital System Design (DSD 2012) Izmir, Turkey, September 2012 [BibTeX]@inproceedings { Jovanovic/etal/2012a,
author = {Jovanovic, Olivera and Marwedel, Peter and Bacivarov, Iuliana and Thiele, Lothar},
title = {MAMOT: Memory-Aware Mapping Optimization Tool for MPSoC},
booktitle = {15th Euromicro Conference on Digital System Design (DSD 2012)},
year = {2012},
address = {Izmir, Turkey},
month = sep,
confidential = {n},
} |
| Jörg Henkel, Lars Bauer, Joachim Becker, Oliver Bringmann, Uwe Brinkschulte, Samarjit Chakraborty, Michael Engel, Rolf Ernst, Hermann Härtig, Lars Hedrich, Andreas Herkersdorf, Rüdiger Kapitza, Daniel Lohmann, Peter Marwedel, Marco Platzner, Wolfgang Rosenstiel, Ulf Schlichtmann, Olaf Spinczyk, Mehdi Tahoori, Jürgen Teich, Norbert Wehn and Hans-Joachim Wunderlich. Design and Architectures for Dependable Embedded Systems. In Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS) Taipei, Taiwan, October 2011 [BibTeX][PDF][Abstract]@inproceedings { SPP1500:11,
author = {Henkel, J{\"o}rg and Bauer, Lars and Becker, Joachim and Bringmann, Oliver and Brinkschulte, Uwe and Chakraborty, Samarjit and Engel, Michael and Ernst, Rolf and H{\"a}rtig, Hermann and Hedrich, Lars and Herkersdorf, Andreas and Kapitza, R{\"u}diger and Lohmann, Daniel and Marwedel, Peter and Platzner, Marco and Rosenstiel, Wolfgang and Schlichtmann, Ulf and Spinczyk, Olaf and Tahoori, Mehdi and Teich, J{\"u}rgen and Wehn, Norbert and Wunderlich, Hans-Joachim},
title = {Design and Architectures for Dependable Embedded Systems},
booktitle = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
year = {2011},
address = {Taipei, Taiwan},
month = oct,
keywords = {ders},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-esweek-marwedel.pdf},
confidential = {n},
abstract = {The paper presents an overview of a major research project on dependable embedded systems that has started in Fall 2010 and is running for a projected duration of six years. Aim is a `dependability co-design' that spans various levels of abstraction in the design process of embedded systems starting from gate level through operating system, applications software to system architecture. In addition, we present a new classification on faults, errors, and failures.},
} The paper presents an overview of a major research project on dependable embedded systems that has started in Fall 2010 and is running for a projected duration of six years. Aim is a `dependability co-design' that spans various levels of abstraction in the design process of embedded systems starting from gate level through operating system, applications software to system architecture. In addition, we present a new classication on faults, errors, and failures.
|
| Constantin Timm, Frank Weichert, David Fiedler, Christian Prasse, Heinrich Müller, Michael Hompel and Peter Marwedel. Decentralized Control of a Material Flow System enabled by an Embedded Computer Vision System. In Proceedings of the IEEE ICC 2011 Workshop on Embedding the Real World into the Future Internet June 2011 [BibTeX][PDF][Abstract]@inproceedings { Timm:2011a,
author = {Timm, Constantin and Weichert, Frank and Fiedler, David and Prasse, Christian and M{\"u}ller, Heinrich and Hompel, Michael and Marwedel, Peter},
title = {Decentralized Control of a Material Flow System enabled by an Embedded Computer Vision System},
booktitle = {Proceedings of the IEEE ICC 2011 Workshop on Embedding the Real World into the Future Internet},
year = {2011},
month = jun,
file = {http://dx.doi.org/10.1109/iccw.2011.5963564},
confidential = {n},
abstract = {In this study, a novel sensor/actuator network approach for scalable automated facility logistics systems is presented. The approach comprises (1) a new sensor combination (cameras and few RFID scanners) for distributed detection, localization and identification of parcels and bins and (2) a novel middleware approach based on a service oriented architecture tailored towards the utilization in sensor/actuator networks. The latter enables a more flexible deploying of automated facility logistics system, while the former presents a novel departure for the detection and tracking of bins and parcels in automated facility logistics systems: light barriers and bar code readers are substituted by low-cost cameras, local conveyor mounted embedded evaluation units and few RFID readers. By combining vision-based systems and RFID systems, this approach can compensate for the drawbacks of each respective system. By utilizing a state-of-the-art middleware for connecting all computer systems of an automated facility logistics system the costs for deployment and reconfiguring the system can be decreased.
The paper describes image processing methods specific to the given problem to both track and read visual markers attached to parcels or bins, processing the data on an embedded system and communication/middleware aspects between different computer systems of an automated facility logistics system such as a database holding the loading and routing information of the conveyed objects as a service for the different visual sensor units. In addition, information from the RFID system is used to narrow the decision space for detection and identification. From an economic point of view this approach enables high density of identification while lowering hardware costs compared to state of the art applications and, due to decentralized control, minimizing the effort for (re-)configuration. These innovations will make automated material flow systems more cost-efficient.},
} In this study, a novel sensor/actuator network approach for scalable automated facility logistics systems is presented. The approach comprises (1) a new sensor combination (cameras and few RFID scanners) for distributed detection, localization and identification of parcels and bins and (2) a novel middleware approach based on a service oriented architecture tailored towards the utilization in sensor/actuator networks. The latter enables a more flexible deploying of automated facility logistics system, while the former presents a novel departure for the detection and tracking of bins and parcels in automated facility logistics systems: light barriers and bar code readers are substituted by low-cost cameras, local conveyor mounted embedded evaluation units and few RFID readers. By combining vision-based systems and RFID systems, this approach can compensate for the drawbacks of each respective system. By utilizing a state-of-the-art middleware for connecting all computer systems of an automated facility logistics system the costs for deployment and reconfiguring the system can be decreased.
The paper describes image processing methods specific to the given problem to both track and read visual markers attached to parcels or bins, processing the data on an embedded system and communication/middleware aspects between different computer systems of an automated facility logistics system such as a database holding the loading and routing information of the conveyed objects as a service for the different visual sensor units. In addition, information from the RFID system is used to narrow the decision space for detection and identification. From an economic point of view this approach enables high density of identification while lowering hardware costs compared to state of the art applications and, due to decentralized control, minimizing the effort for (re-)configuration. These innovations will make automated material flow systems more cost-efficient.
|
| Constantin Timm, Pascal Libuschewski, Dominic Siedhoff, Frank Weichert, Heinrich Müller and Peter Marwedel. Improving Nanoobject Detection in Optical Biosensor Data. In Proceedings of the 5th International Symposium on Bio- and Medical Informatics and Cybernetics, BMIC 2011 July 2011 [BibTeX][PDF][Abstract]@inproceedings { Timm:2011b,
author = {Timm, Constantin and Libuschewski, Pascal and Siedhoff, Dominic and Weichert, Frank and M{\"u}ller, Heinrich and Marwedel, Peter},
title = {Improving Nanoobject Detection in Optical Biosensor Data},
booktitle = {Proceedings of the 5th International Symposium on Bio- and Medical Informatics and Cybernetics, BMIC 2011},
year = {2011},
month = jul,
file = {http://www.iiis.org/CDs2011/CD2011SCI/BMIC_2011/PapersPdf/BA536CW.pdf},
confidential = {n},
abstract = {The importance of real-time capable mobile biosensors increases in face of rising numbers of global virus epidemics. Such biosensors can be used for on-site diagnosis, e.g. at airports, to prevent further spread of virus-transmitted diseases, by answering the question whether or not a sample contains a certain virus. In-depth laboratory analysis might furthermore demand for measurements of the concentration of virus particles in a sample. The novel PAMONO sensor technique allows for accomplishing both tasks. One of its basic prerequisites is an efficient analysis of the biosensor image data by means of digital image processing and classification. In this study, we present a high performance approach to this analysis: The diagnosis whether a virus occurs in the sample can be carried out in real-time with high accuracy. An estimate of the concentration can be obtained in real-time as well, if that concentration is not too high.
The contribution of this work is an optimization of our processing pipeline used for PAMONO sensor data analysis. The following objectives are optimized: detection-quality, speed and consumption of resources (e.g. energy, memory). Thus our approach respects the constraints imposed by medical applicability, as well as the constraints on resource consumption arising in embedded systems. The parameters to be optimized are descriptive (virus appearance parameters) and hardware-related (design space exploration).
},
} The importance of real-time capable mobile biosensors increases in face of rising numbers of global virus epidemics. Such biosensors can be used for on-site diagnosis, e.g. at airports, to prevent further spread of virus-transmitted diseases, by answering the question whether or not a sample contains a certain virus. In-depth laboratory analysis might furthermore demand for measurements of the concentration of virus particles in a sample. The novel PAMONO sensor technique allows for accomplishing both tasks. One of its basic prerequisites is an efficient analysis of the biosensor image data by means of digital image processing and classification. In this study, we present a high performance approach to this analysis: The diagnosis whether a virus occurs in the sample can be carried out in real-time with high accuracy. An estimate of the concentration can be obtained in real-time as well, if that concentration is not too high.
The contribution of this work is an optimization of our processing pipeline used for PAMONO sensor data analysis. The following objectives are optimized: detection-quality, speed and consumption of resources (e.g. energy, memory). Thus our approach respects the constraints imposed by medical applicability, as well as the constraints on resource consumption arising in embedded systems. The parameters to be optimized are descriptive (virus appearance parameters) and hardware-related (design space exploration).
|
| Michael Engel, Florian Schmoll, Andreas Heinig and Peter Marwedel. Temporal Properties of Error Handling for Multimedia Applications. In Proceedings of the 14th ITG Conference on Electronic Media Technology Dortmund / Germany, February 2011 [BibTeX][PDF][Abstract]@inproceedings { engel:11:itg,
author = {Engel, Michael and Schmoll, Florian and Heinig, Andreas and Marwedel, Peter},
title = {Temporal Properties of Error Handling for Multimedia Applications},
booktitle = {Proceedings of the 14th ITG Conference on Electronic Media Technology},
year = {2011},
address = {Dortmund / Germany},
month = feb,
keywords = {ders},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-itg-engel.pdf},
confidential = {n},
abstract = {In embedded consumer electronics devices, cost pressure is one of the driving design objectives. Devices that handle multimedia information, like DVD players or digital video cameras require high computing performance and real- time capabilities while adhering to the cost restrictions. The cost pressure often results in system designs that barely exceed the minimum requirements for such a system.
Thus, hardware-based fault tolerance methods frequently are ignored due to their cost overhead. However, the amount of transient faults showing up in semiconductor-based systems is expected to increase sharply in the near future. Thus, low- overhead methods to correct related errors in such systems are required. Considering restrictions in processing speed, the real-time properties of a system with added error handling are of special interest. In this paper, we present our approach to flexible error handling and discuss the challenges as well as the inherent timing dependencies to deploy it in a typical soft real- time multimedia system, a H.264 video decoder.},
} In embedded consumer electronics devices, cost pressure is one of the driving design objectives. Devices that handle multimedia information, like DVD players or digital video cameras require high computing performance and real- time capabilities while adhering to the cost restrictions. The cost pressure often results in system designs that barely exceed the minimum requirements for such a system.
Thus, hardware-based fault tolerance methods frequently are ignored due to their cost overhead. However, the amount of transient faults showing up in semiconductor-based systems is expected to increase sharply in the near future. Thus, low- overhead methods to correct related errors in such systems are required. Considering restrictions in processing speed, the real-time properties of a system with added error handling are of special interest. In this paper, we present our approach to flexible error handling and discuss the challenges as well as the inherent timing dependencies to deploy it in a typical soft real- time multimedia system, a H.264 video decoder.
|
| Michael Engel, Florian Schmoll, Andreas Heinig and Peter Marwedel. Unreliable yet Useful -- Reliability Annotations for Data in Cyber-Physical Systems. In Proceedings of the 2011 Workshop on Software Language Engineering for Cyber-physical Systems (WS4C) Berlin / Germany, October 2011 [BibTeX][PDF][Abstract]@inproceedings { engel:11:ws4c,
author = {Engel, Michael and Schmoll, Florian and Heinig, Andreas and Marwedel, Peter},
title = {Unreliable yet Useful -- Reliability Annotations for Data in Cyber-Physical Systems},
booktitle = {Proceedings of the 2011 Workshop on Software Language Engineering for Cyber-physical Systems (WS4C)},
year = {2011},
address = {Berlin / Germany},
month = oct,
keywords = {ders},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-ws4c-engel.pdf},
confidential = {n},
abstract = {Today, cyber-physical systems face yet another challenge in addition to the traditional constraints in energy, computing power, or memory. Shrinking semiconductor structure sizes and supply voltages imply that the number of errors that manifest themselves in a system will rise significantly. Most CP systems have to survive errors, but many systems do not have sufficient resources to correct all errors that show up. Thus, it is important to spend the available resources on handling errors with the most critical effect. We propose an ``unreliability'' annotation for data types in C programs that indicates if an error showing up in a specific variable or data structure will possibly cause a severe problem like a program crash or might only show rather negligible effects, e.g., a discolored pixel in video decoding. This classification of data is supported by static analysis methods that verify if the value contained in a variable marked as unreliable does not end up as part of a critical operation, e.g., an array index or loop termination condition. This classification enables several approaches to flexible error handling. For example, a CP system designer might choose to selectively safeguard variables marked as non-unreliable or to employ memories with different reliabilty properties to store the respective values.},
} Today, cyber-physical systems face yet another challenge in addition to the traditional constraints in energy, computing power, or memory. Shrinking semiconductor structure sizes and supply voltages imply that the number of errors that manifest themselves in a system will rise significantly. Most CP systems have to survive errors, but many systems do not have sufficient resources to correct all errors that show up. Thus, it is important to spend the available resources on handling errors with the most critical effect. We propose an "unreliability" annotation for data types in C programs that indicates if an error showing up in a specific variable or data structure will possibly cause a severe problem like a program crash or might only show rather negligible effects, e.g., a discolored pixel in video decoding. This classification of data is supported by static analysis methods that verify if the value contained in a variable marked as unreliable does not end up as part of a critical operation, e.g., an array index or loop termination condition. This classification enables several approaches to flexible error handling. For example, a CP system designer might choose to selectively safeguard variables marked as non-unreliable or to employ memories with different reliabilty properties to store the respective values.
|
| Constantin Timm, Frank Weichert, Peter Marwedel and Heinrich Müller. Multi-Objective Local Instruction Scheduling for GPGPU Applications. In Proceedings of the International Conference on Parallel and Distributed Computing Systems 2011 (PDCS) Dallas, USA, December 2011 [BibTeX][PDF][Abstract]@inproceedings { timm:2011:pdcs,
author = {Timm, Constantin and Weichert, Frank and Marwedel, Peter and M{\"u}ller, Heinrich},
title = {Multi-Objective Local Instruction Scheduling for GPGPU Applications},
booktitle = {Proceedings of the International Conference on Parallel and Distributed Computing Systems 2011 (PDCS)},
year = {2011},
address = {Dallas, USA},
month = dec,
publisher = {IASTED/ACTA Press},
file = {http://www.actapress.com/PaperInfo.aspx?paperId=453074},
confidential = {n},
abstract = {In this paper, a new optimization approach (MOLIS: Multi-Objective Local Instruction Scheduling) is presented which maximizes the performance and minimizes the energy consumption of GPGPU applications. The design process of writing efficient GPGPU applications is time-consuming. This disadvantage mainly arises from the fact that the optimization of an application is accomplished in an expensive trial-and-error manner without efficient compiler support. Especially, efficient register utilization and load balancing of the concurrently working instruction and memory pipelines were not considered in the compile process up to now. Another drawback of the state-of-the-art GPGPU application design process is that energy consumption is not taken into account, which is important in the face of green computing. In order to optimize performance and energy consumption simultaneously, a multi-objective genetic algorithm was utilized. The optimization of GPGPU applications in MOLIS employs local instruction scheduling methods. The optimization potential of MOLIS was evaluated by profiling the runtime and the energy consumption on a real platform. The optimization approach was tested with several real-world benchmarks stemming from the Nvidia CUDA examples, the VSIPL-GPGPU-Library and the Rodinia benchmark suite. By applying MOLIS to the real-world benchmarks, up to 9% energy and 12% runtime can be saved.},
} In this paper, a new optimization approach (MOLIS: Multi-Objective Local Instruction Scheduling) is presented which maximizes the performance and minimizes the energy consumption of GPGPU applications. The design process of writing efficient GPGPU applications is time-consuming. This disadvantage mainly arises from the fact that the optimization of an application is accomplished in an expensive trial-and-error manner without efficient compiler support. Especially, efficient register utilization and load balancing of the concurrently working instruction and memory pipelines were not considered in the compile process up to now. Another drawback of the state-of-the-art GPGPU application design process is that energy consumption is not taken into account, which is important in the face of green computing. In order to optimize performance and energy consumption simultaneously, a multi-objective genetic algorithm was utilized. The optimization of GPGPU applications in MOLIS employs local instruction scheduling methods. The optimization potential of MOLIS was evaluated by profiling the runtime and the energy consumption on a real platform. The optimization approach was tested with several real-world benchmarks stemming from the Nvidia CUDA examples, the VSIPL-GPGPU-Library and the Rodinia benchmark suite. By applying MOLIS to the real-world benchmarks, up to 9% energy and 12% runtime can be saved.
|
| Timon Kelter, Heiko Falk, Peter Marwedel, Sudipta Chattopadhyay and Abhik Roychoudhury. Bus-Aware Multicore WCET Analysis through TDMA Offset Bounds. In Proceedings of the 23rd Euromicro Conference on Real-Time Systems (ECRTS), pages 3-12 Porto / Portugal, July 2011 [BibTeX][PDF][Abstract]@inproceedings { kelter:11:ecrts,
author = {Kelter, Timon and Falk, Heiko and Marwedel, Peter and Chattopadhyay, Sudipta and Roychoudhury, Abhik},
title = {Bus-Aware Multicore WCET Analysis through TDMA Offset Bounds},
booktitle = {Proceedings of the 23rd Euromicro Conference on Real-Time Systems (ECRTS)},
year = {2011},
pages = {3--12},
address = {Porto / Portugal},
month = jul,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-ecrts_1.pdf},
confidential = {n},
abstract = {In the domain of real-time systems, the analysis of the timing behavior of programs is crucial for guaranteeing the schedulability and thus the safeness of a system. Static analyses of the \textit{WCET} (Worst-Case Execution Time) have proven to be a key element for timing analysis, as they provide safe upper bounds on a program's execution time. For single-core systems, industrial-strength WCET analyzers are already available, but up to now, only first proposals have been made to analyze the WCET in multicore systems, where the different cores may interfere during the access to shared resources. An important example for this are shared buses which connect the cores to a shared main memory. The time to gain access to the shared bus may vary significantly, depending on the used bus arbitration protocol and the access timings. In this paper, we propose a new technique for analyzing the duration of accesses to shared buses. We implemented a prototype tool which uses the new analysis and tested it on a set of realworld benchmarks. Results demonstrate that our analysis achieves the same precision as the best existing approach while drastically outperforming it in matters of analysis time.},
} In the domain of real-time systems, the analysis of the timing behavior of programs is crucial for guaranteeing the schedulability and thus the safeness of a system. Static analyses of the WCET (Worst-Case Execution Time) have proven to be a key element for timing analysis, as they provide safe upper bounds on a program's execution time. For single-core systems, industrial-strength WCET analyzers are already available, but up to now, only first proposals have been made to analyze the WCET in multicore systems, where the different cores may interfere during the access to shared resources. An important example for this are shared buses which connect the cores to a shared main memory. The time to gain access to the shared bus may vary significantly, depending on the used bus arbitration protocol and the access timings. In this paper, we propose a new technique for analyzing the duration of accesses to shared buses. We implemented a prototype tool which uses the new analysis and tested it on a set of realworld benchmarks. Results demonstrate that our analysis achieves the same precision as the best existing approach while drastically outperforming it in matters of analysis time.
|
| Sascha Plazar, Jan C. Kleinsorge, Heiko Falk and Peter Marwedel. WCET-driven Branch Prediction aware Code Positioning. In Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES), pages 165-174 Taipei, Taiwan, October 2011 [BibTeX][PDF][Abstract]@inproceedings { plazar:11:cases,
author = {Plazar, Sascha and Kleinsorge, Jan C. and Falk, Heiko and Marwedel, Peter},
title = {WCET-driven Branch Prediction aware Code Positioning},
booktitle = {Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES)},
year = {2011},
pages = {165--174},
address = {Taipei, Taiwan},
month = oct,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-cases_2.pdf},
confidential = {n},
abstract = {In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches, branch prediction units and speculative execution. This step was necessary in order to fulfill increasing requirements on computational power. Static analysis techniques considering such speculative units had to be developed to allow the estimation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems.
In this paper, we propose a WCET-driven branch prediction aware optimization which reorders basic blocks of a function in order to reduce the amount of jump instructions and mispredicted branches. We employed a genetic algorithm which rearranges basic blocks in order to decrease the WCET of a program. This enables a first estimation of the possible optimization potential at the cost of high optimization runtimes. To avoid time consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP). The ILP models the worst-case execution path (WCEP) of a program and takes branch prediction effects into account. This algorithm enables short optimization runtimes at slightly decreased optimization results. In a case study, the genetic algorithm is able to reduce the benchmarks’ WCET by up to 24.7% whereas our ILP-based approach is able to decrease the WCET by up to 20.0%.
},
} In the past decades, embedded system designers moved from simple, predictable system designs towards complex systems equipped with caches, branch prediction units and speculative execution. This step was necessary in order to fulfill increasing requirements on computational power. Static analysis techniques considering such speculative units had to be developed to allow the estimation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify whether hard real-time systems satisfy their timing constraints, and the WCET is a key parameter for the design of embedded systems.
In this paper, we propose a WCET-driven branch prediction aware optimization which reorders basic blocks of a function in order to reduce the amount of jump instructions and mispredicted branches. We employed a genetic algorithm which rearranges basic blocks in order to decrease the WCET of a program. This enables a first estimation of the possible optimization potential at the cost of high optimization runtimes. To avoid time consuming repetitive WCET analyses, we developed a new algorithm employing integer-linear programming (ILP). The ILP models the worst-case execution path (WCEP) of a program and takes branch prediction effects into account. This algorithm enables short optimization runtimes at slightly decreased optimization results. In a case study, the genetic algorithm is able to reduce the benchmarks’ WCET by up to 24.7% whereas our ILP-based approach is able to decrease the WCET by up to 20.0%.
|
| Daniel Cordes, Andreas Heinig, Peter Marwedel and Arindam Mallik. Automatic Extraction of Pipeline Parallelism for Embedded Software Using Linear Programming. In Proceedings of the 17th IEEE International Conference on Parallel and Distributed Systems (ICPADS), 2011, pages 699 -706 Tainan, Taiwan, December 2011 [BibTeX][PDF][Abstract]@inproceedings { cordes:2011:icpads,
author = {Cordes, Daniel and Heinig, Andreas and Marwedel, Peter and Mallik, Arindam},
title = {Automatic Extraction of Pipeline Parallelism for Embedded Software Using Linear Programming},
booktitle = {Proceedings of the 17th IEEE International Conference on Parallel and Distributed Systems (ICPADS), 2011},
year = {2011},
pages = {699--706},
address = {Tainan, Taiwan},
month = dec,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-icpads-cordes.pdf},
confidential = {n},
abstract = {The complexity and performance requirements of embedded software are continuously increasing, making Multiprocessor System-on-Chip (MPSoC) architectures more and more important in the domain of embedded and cyber-physical systems. Using multiple cores in a single system reduces problems concerning energy consumption, heat dissipation, and increases performance. Nevertheless, these benefits do not come for free. Porting existing, mostly sequential, applications to MPSoCs requires extracting efficient parallelism to utilize all available cores. Many embedded applications, like network services and multimedia tasks for voice-, image- and video processing, are operating on data streams and thus have a streaming-based structure. Despite the abundance of parallelism in streaming applications, it is a non-trivial task to split and efficiently map sequential applications to MPSoCs. Therefore, we present an algorithm which automatically extracts pipeline parallelism from sequential ANSI-C applications. The presented tool employs an integer linear programming (ILP) based approach enriched with an adequate cost model to automatically control the granularity of the parallelization. By applying our tool to real-life applications, it can be shown that our approach is able to speed up applications by a factor of up to 3.9x on a four-core MPSoC architecture, compared to a sequential execution.},
} The complexity and performance requirements of embedded software are continuously increasing, making Multiprocessor System-on-Chip (MPSoC) architectures more and more important in the domain of embedded and cyber-physical systems. Using multiple cores in a single system reduces problems concerning energy consumption, heat dissipation, and increases performance. Nevertheless, these benefits do not come for free. Porting existing, mostly sequential, applications to MPSoCs requires extracting efficient parallelism to utilize all available cores. Many embedded applications, like network services and multimedia tasks for voice-, image- and video processing, are operating on data streams and thus have a streaming-based structure. Despite the abundance of parallelism in streaming applications, it is a non-trivial task to split and efficiently map sequential applications to MPSoCs. Therefore, we present an algorithm which automatically extracts pipeline parallelism from sequential ANSI-C applications. The presented tool employs an integer linear programming (ILP) based approach enriched with an adequate cost model to automatically control the granularity of the parallelization. By applying our tool to real-life applications, it can be shown that our approach is able to speed up applications by a factor of up to 3.9x on a four-core MPSoC architecture, compared to a sequential execution.
|
| Peter Marwedel and Michael Engel. Embedded System Design 2.0: Rationale Behind a Textbook Revision. In Proceedings of Workshop on Embedded Systems Education (WESE) Taipei, Taiwan, October 2011 [BibTeX][PDF][Abstract]@inproceedings { marwedel:2011:wese,
author = {Marwedel, Peter and Engel, Michael},
title = {Embedded System Design 2.0: Rationale Behind a Textbook Revision},
booktitle = {Proceedings of Workshop on Embedded Systems Education (WESE)},
year = {2011},
address = {Taipei, Taiwan},
month = oct,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-wese-marwedel.pdf},
confidential = {n},
abstract = {Seven years after its first release, it became necessary to publish a new edition of the author’s text book on embedded system
design. This paper explains the key changes that were incorporated into the second edition. These changes reflect seven years of teaching of the subject, with two courses every year. The rationale behind these changes can also be found in the paper. In this way, the paper also reflects changes in the area over time, while the area becomes more mature. The paper helps understanding why a particular topic is included in this curriculum for embedded system design and why a certain structure of the course is suggested.},
} Seven years after its first release, it became necessary to publish a new edition of the author’s text book on embedded system
design. This paper explains the key changes that were incorporated into the second edition. These changes reflect seven years of teaching of the subject, with two courses every year. The rationale behind these changes can also be found in the paper. In this way, the paper also reflects changes in the area over time, while the area becomes more mature. The paper helps understanding why a particular topic is included in this curriculum for embedded system design and why a certain structure of the course is suggested.
|
| Jan C. Kleinsorge, Heiko Falk and Peter Marwedel. A Synergetic Approach To Accurate Analysis Of Cache-Related Preemption Delay. In Proceedings of the International Conference on Embedded Software (EMSOFT), pages 329-338 Taipei, Taiwan, October 2011 [BibTeX][PDF][Abstract]@inproceedings { kleinsorge:11:emsoft,
author = {Kleinsorge, Jan C. and Falk, Heiko and Marwedel, Peter},
title = {A Synergetic Approach To Accurate Analysis Of Cache-Related Preemption Delay},
booktitle = {Proceedings of the International Conference on Embedded Software (EMSOFT)},
year = {2011},
pages = {329-338},
address = {Taipei, Taiwan},
month = {oct},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-emsoft.pdf},
confidential = {n},
abstract = {The worst-case execution time (WCET) of a task denotes the largest possible execution time for all possible inputs and thus, hardware states. For non-preemptive multitask scheduling, techniques for the static estimation of safe upper bounds have been subject to industrial practice for years. For preemptive scheduling however, the isolated analysis of tasks becomes imprecise as interferences among tasks cannot be considered with sufficient precision. For such scenarios, the cache-related preemption delay (CRPD) denotes a key metric as it reflects the effects of preemptions on the execution behavior of a single task. Until recently, proposals for CRPD analyses were often limited to direct mapped caches or comparably imprecise for k-way set-associative caches.
In this paper, we propose how the current best techniques for CRPD analysis, which have only been proposed separately and for different aspects of the analysis can be brought together to construct an efficient CRPD analysis with unique properties. Moreover, along the construction, we propose several different enhancements to the methods employed. We also exploit that in a complete approach, analysis steps are synergetic and can be combined into a single analysis pass solving all formerly separate steps at once. In addition, we argue that it is often sufficient to carry out the combined analysis on basic block bounds, which further lowers the overall complexity. The result is a proposal for a fast CRPD analysis of very high accuracy.
},
} The worst-case execution time (WCET) of a task denotes the largest possible execution time for all possible inputs and thus, hardware states. For non-preemptive multitask scheduling, techniques for the static estimation of safe upper bounds have been subject to industrial practice for years. For preemptive scheduling however, the isolated analysis of tasks becomes imprecise as interferences among tasks cannot be considered with sufficient precision. For such scenarios, the cache-related preemption delay (CRPD) denotes a key metric as it reflects the effects of preemptions on the execution behavior of a single task. Until recently, proposals for CRPD analyses were often limited to direct mapped caches or comparably imprecise for k-way set-associative caches.
In this paper, we propose how the current best techniques for CRPD analysis, which have only been proposed separately and for different aspects of the analysis can be brought together to construct an efficient CRPD analysis with unique properties. Moreover, along the construction, we propose several different enhancements to the methods employed. We also exploit that in a complete approach, analysis steps are synergetic and can be combined into a single analysis pass solving all formerly separate steps at once. In addition, we argue that it is often sufficient to carry out the combined analysis on basic block bounds, which further lowers the overall complexity. The result is a proposal for a fast CRPD analysis of very high accuracy.
|
| Peter Marwedel, Jürgen Teich, Georgia Kouveli, Iuliana Bacivarov, Lothar Thiele, Soonhoi Ha, Chanhee Lee, Qiang Xu and Lin Huang. Mapping of Applications to MPSoCs. In Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS) Taipei, Taiwan, October 2011 [BibTeX][PDF][Abstract]@inproceedings { marwedel:2011:codes-isss2,
author = {Marwedel, Peter and Teich, J\"urgen and Kouveli, Georgia and Bacivarov, Iuliana and Thiele, Lothar and Ha, Soonhoi and Lee, Chanhee and Xu, Qiang and Huang, Lin},
title = {Mapping of Applications to MPSoCs},
booktitle = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
year = {2011},
address = {Taipei, Taiwan},
month = oct,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2011-codes-isss-marwedel.pdf},
confidential = {n},
abstract = {The advent of embedded many-core architectures results in the need to come up with techniques for mapping embedded applications onto such architectures. This paper presents a representative set of such techniques. The techniques focus on optimizing performance, temperature distribution, reliability and fault tolerance for various models.},
} The advent of embedded many-core architectures results in the need to come up with techniques for mapping embedded applications onto such architectures. This paper presents a representative set of such techniques. The techniques focus on optimizing performance, temperature distribution, reliability and fault tolerance for various models.
|
| Robert Pyka, Felipe Klein, Peter Marwedel and Stylianos Mamagkakis. Versatile System-Level Memory-Aware Platform Description Approach for Embedded MPSoCs. In Proc. of the ACM SIGPLAN/SIGBED 2010 Conference on Languages, Compilers, and Tools for Embedded Systems, pages 9-16 2010 [BibTeX][Abstract]@inproceedings { pyka:2010,
author = {Pyka, Robert and Klein, Felipe and Marwedel, Peter and Mamagkakis, Stylianos},
title = {Versatile System-Level Memory-Aware Platform Description Approach for Embedded MPSoCs},
booktitle = {Proc. of the ACM SIGPLAN/SIGBED 2010 Conference on Languages, Compilers, and Tools for Embedded Systems},
year = {2010},
pages = {9-16},
publisher = {ACM},
confidential = {n},
abstract = {In this paper, we present a novel system modeling language which targets primarily the development of source-level multiprocessor memory aware optimizations. In contrast to previous system modeling approaches this approach tries to model the whole system and especially the memory hierarchy in a structural and semantically accessible way. Previous approaches primarily support generation of simulators or retargetable code selectors and thus concentrate on pure behavioral models or describe only the processor instruction set in a semantically accessible way. A simple, database-like, interface is offered to the optimization developer, which in conjunction with the MACCv2 framework enables rapid development of source-level architecture independent optimizations.},
} In this paper, we present a novel system modeling language which targets primarily the development of source-level multiprocessor memory aware optimizations. In contrast to previous system modeling approaches this approach tries to model the whole system and especially the memory hierarchy in a structural and semantically accessible way. Previous approaches primarily support generation of simulators or retargetable code selectors and thus concentrate on pure behavioral models or describe only the processor instruction set in a semantically accessible way. A simple, database-like, interface is offered to the optimization developer, which in conjunction with the MACCv2 framework enables rapid development of source-level architecture independent optimizations.
|
| Peter Marwedel and Michael Engel. Plea for a Holistic Analysis of the Relationship between Information Technology and Carbon-Dioxide Emissions. In Workshop on Energy-aware Systems and Methods (GI-ITG) Hanover / Germany, February 2010 [BibTeX][PDF][Abstract]@inproceedings { marwedel:10:GI,
author = {Marwedel, Peter and Engel, Michael},
title = {Plea for a Holistic Analysis of the Relationship between Information Technology and Carbon-Dioxide Emissions},
booktitle = {Workshop on Energy-aware Systems and Methods (GI-ITG)},
year = {2010},
address = {Hanover / Germany},
month = {feb},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/arcs-10-marwedel.pdf},
confidential = {n},
abstract = {An analysis of the relationship between information technology (IT) and carbon-dioxide (CO2) emissions should not be constrained to an analysis of emissions caused during the operation of IT equipment. Rather, an analysis of emissions should be based on a full life-cycle assessment (LCA) of IT systems, from their conception until their recycling. Also, the reduction of emissions through the use of IT systems should not be forgotten. This paper explains these viewpoints in more detail and provides rough life-cycle analyses of personal computers (PCs). It will be shown that -- for standard scenarios -- emissions from PC production are exceeding those of their shipment and use. This stresses the importance of using PCs as long as possible.},
} An analysis of the relationship between information technology (IT) and carbon-dioxide (CO2) emissions should not be constrained to an analysis of emissions caused during the operation of IT equipment. Rather, an analysis of emissions should be based on a full life-cycle assessment (LCA) of IT systems, from their conception until their recycling. Also, the reduction of emissions through the use of IT systems should not be forgotten. This paper explains these viewpoints in more detail and provides rough life-cycle analyses of personal computers (PCs). It will be shown that -- for standard scenarios -- emissions from PC production are exceeding those of their shipment and use. This stresses the importance of using PCs as long as possible.
|
| Constantin Timm, Andrej Gelenberg, Peter Marwedel and Frank Weichert. Energy Considerations within the Integration of General Purpose GPUs in Embedded Systems. In Proceedings of the International Conference on Advances in Distributed and Parallel Computing November 2010 [BibTeX]@inproceedings { timm:2010:adpc,
author = {Timm, Constantin and Gelenberg, Andrej and Marwedel, Peter and Weichert, Frank},
title = {Energy Considerations within the Integration of General Purpose GPUs in Embedded Systems},
booktitle = {Proceedings of the International Conference on Advances in Distributed and Parallel Computing},
year = {2010},
month = nov,
publisher = {Global Science \& Technology Forum},
confidential = {n},
} |
| Daniel Cordes, Peter Marwedel and Arindam Mallik. Automatic Parallelization of Embedded Software Using Hierarchical Task Graphs and Integer Linear Programming. In Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis (CODES+ISSS 2010) Scottsdale / US, October 2010 [BibTeX][PDF][Abstract]@inproceedings { cordes:10:CODES,
author = {Cordes, Daniel and Marwedel, Peter and Mallik, Arindam},
title = {Automatic Parallelization of Embedded Software Using Hierarchical Task Graphs and Integer Linear Programming},
booktitle = {Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis (CODES+ISSS 2010)},
year = {2010},
address = {Scottsdale / US},
month = {oct},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-codes-cordes.pdf},
confidential = {n},
abstract = {The last years have shown that there is no way to disregard the advantages provided by multiprocessor System-on-Chip (MPSoC) architectures in the embedded systems domain. Using multiple cores in a single system enables to close the gap between energy consumption, problems concerning heat dissipation, and computational power. Nevertheless, these benefits do not come for free. New challenges arise, if existing applications have to be ported to these multiprocessor platforms. One of the most ambitious tasks is to extract efficient parallelism from these existing sequential applications. Hence, many parallelization tools have been developed, most of them are extracting as much parallelism as possible, which is in general not the best choice for embedded systems with their limitations in hardware and software support. In contrast to previous approaches, we present a new automatic parallelization tool, tailored to the particular requirements of the resource constrained embedded systems. Therefore, this paper presents an algorithm which automatically steers the granularity of the generated tasks, with respect to architectural requirements and the overall execution time reduction. For this purpose, we exploit hierarchical task graphs to simplify a new integer linear programming based approach in order to split up sequential programs in an efficient way. Results on real-life benchmarks have shown that the presented approach is able to speed sequential applications up by a factor of up to 3.7 on a four core MPSoC architecture.},
} The last years have shown that there is no way to disregard the advantages provided by multiprocessor System-on-Chip (MPSoC) architectures in the embedded systems domain. Using multiple cores in a single system enables to close the gap between energy consumption, problems concerning heat dissipation, and computational power. Nevertheless, these benefits do not come for free. New challenges arise, if existing applications have to be ported to these multiprocessor platforms. One of the most ambitious tasks is to extract efficient parallelism from these existing sequential applications. Hence, many parallelization tools have been developed, most of them are extracting as much parallelism as possible, which is in general not the best choice for embedded systems with their limitations in hardware and software support. In contrast to previous approaches, we present a new automatic parallelization tool, tailored to the particular requirements of the resource constrained embedded systems. Therefore, this paper presents an algorithm which automatically steers the granularity of the generated tasks, with respect to architectural requirements and the overall execution time reduction. For this purpose, we exploit hierarchical task graphs to simplify a new integer linear programming based approach in order to split up sequential programs in an efficient way. Results on real-life benchmarks have shown that the presented approach is able to speed sequential applications up by a factor of up to 3.7 on a four core MPSoC architecture.
|
| Peter Marwedel and Michael Engel. Ein Plädoyer für eine holistische Analyse der Zusammenhänge zwischen Informationstechnologie und Kohlendioxyd-Emissionen. In VDE-Kongress Leipzig, Germany, November 2010 [BibTeX]@inproceedings { marwedel:10:VDE,
author = {Marwedel, Peter and Engel, Michael},
title = {Ein Pl{\"a}doyer f{\"u}r eine holistische Analyse der Zusammenh{\"a}nge zwischen Informationstechnologie und Kohlendioxyd-Emissionen},
booktitle = {VDE-Kongress},
year = {2010},
address = {Leipzig, Germany},
month = {nov},
confidential = {n},
} |
| Sascha Plazar, Peter Marwedel and Jörg Rahnenführer. Optimizing Execution Runtimes of R Programs. In Book of Abstracts of International Symposium on Business and Industrial Statistics (ISBIS), pages 81-82 Portoroz (Portorose) / Slovenia, July 2010 [BibTeX][PDF]@inproceedings { plazar:10:isbis,
author = {Plazar, Sascha and Marwedel, Peter and Rahnenf{\"u}hrer, J{\"o}rg},
title = {Optimizing Execution Runtimes of R Programs},
booktitle = {Book of Abstracts of International Symposium on Business and Industrial Statistics (ISBIS)},
year = {2010},
pages = {81-82},
address = {Portoroz (Portorose) / Slovenia},
month = {jul},
keywords = {rcs},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isbis.pdf},
confidential = {n},
} |
| Sascha Plazar, Paul Lokuciejewski and Peter Marwedel. WCET-driven Cache-aware Memory Content Selection. In Proceedings of the 13th IEEE International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC), pages 107-114 Carmona / Spain, May 2010 [BibTeX][PDF][Abstract]@inproceedings { plazar:10:isorc,
author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter},
title = {WCET-driven Cache-aware Memory Content Selection},
booktitle = {Proceedings of the 13th IEEE International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC)},
year = {2010},
pages = {107-114},
address = {Carmona / Spain},
month = {may},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isorc.pdf},
confidential = {n},
abstract = {Caches are widely used to bridge the increasingly growing gap between processor and memory performance. They store copies of frequently used parts of the slow main memory for faster access. Static analysis techniques allow the estimation of the worst-case cache behavior and enable the computation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify if hard real-time systems satisfy their timing constraints and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a new WCET-driven cache-aware memory content selection algorithm, which allocates functions whose WCET highly benefits from a cached execution to cached memory areas. Vice versa, rarely used functions which do not benefit from a cached execution are allocated to non-cached memory areas. As a result of this, unfavorable functions w.\,r.\,t. a program's WCET can not evict beneficial functions from the cache. This can lead to a reduced cache miss ratio and a decreased WCET. The effectiveness of our approach is demonstrated by results achieved on real-life benchmarks. In a case study, our greedy algorithm is able to reduce the benchmarks' WCET by up to 20\%.},
} Caches are widely used to bridge the increasingly growing gap between processor and memory performance. They store copies of frequently used parts of the slow main memory for faster access. Static analysis techniques allow the estimation of the worst-case cache behavior and enable the computation of an upper bound of the execution time of a program. This bound is called worst-case execution time (WCET). Its knowledge is crucial to verify if hard real-time systems satisfy their timing constraints and the WCET is a key parameter for the design of embedded systems. In this paper, we propose a new WCET-driven cache-aware memory content selection algorithm, which allocates functions whose WCET highly benefits from a cached execution to cached memory areas. Vice versa, rarely used functions which do not benefit from a cached execution are allocated to non-cached memory areas. As a result of this, unfavorable functions w. r. t. a program's WCET can not evict beneficial functions from the cache. This can lead to a reduced cache miss ratio and a decreased WCET. The effectiveness of our approach is demonstrated by results achieved on real-life benchmarks. In a case study, our greedy algorithm is able to reduce the benchmarks' WCET by up to 20%.
|
| Frank Weichert, Marcel Gaspar, Alexander Zybin, Evgeny Gurevich, Alexander Görtz, Constantin Timm, Heinrich Müller and Peter Marwedel. Plasmonen-unterstützte Mikroskopie zur Detektion von Viren. In Bildverarbeitung für die Medizin Aachen / Germany, March 2010 [BibTeX][Abstract]@inproceedings { weichert:10:bvm,
author = {Weichert, Frank and Gaspar, Marcel and Zybin, Alexander and Gurevich, Evgeny and G\"ortz, Alexander and Timm, Constantin and M\"uller, Heinrich and Marwedel, Peter},
title = {Plasmonen-unterst\"utzte Mikroskopie zur Detektion von Viren},
booktitle = {Bildverarbeitung f\"ur die Medizin},
year = {2010},
address = {Aachen / Germany},
month = mar,
confidential = {n},
abstract = {In Anbetracht zunehmend epidemisch auftretender viraler Infektionen ist eine effiziente und ubiquit\"ar verf\"ugbare Methode zur sicheren Virusdetektion hoch relevant. Mit der Plasmonen-unterst\"utzten Mikroskopie steht hierzu eine neuartige Untersuchungsmethode bereit, die aber gro{\ss}e Anforderungen an die Bildverarbeitung zur Differenzierung der Viren innerhalb der Bilddaten stellt. In dieser Arbeit wird hierzu ein erster erfolgversprechender Ansatz vorgestellt. \"Uber bildbasierte Mustererkennung und Zeitreihenanalysen in Kombination mit Klassifikationsverfahren konnte sowohl die Differenzierung von Nanoobjekten als auch die Detektion von Virus-\"ahnlichen Partikeln nachgewiesen werden.},
} In Anbetracht zunehmend epidemisch auftretender viraler Infektionen ist eine effiziente und ubiquitär verfügbare Methode zur sicheren Virusdetektion hoch relevant. Mit der Plasmonen-unterstützten Mikroskopie steht hierzu eine neuartige Untersuchungsmethode bereit, die aber große Anforderungen an die Bildverarbeitung zur Differenzierung der Viren innerhalb der Bilddaten stellt. In dieser Arbeit wird hierzu ein erster erfolgversprechender Ansatz vorgestellt. Über bildbasierte Mustererkennung und Zeitreihenanalysen in Kombination mit Klassifikationsverfahren konnte sowohl die Differenzierung von Nanoobjekten als auch die Detektion von Virus-ähnlichen Partikeln nachgewiesen werden.
|
| Andreas Heinig, Michael Engel, Florian Schmoll and Peter Marwedel. Using Application Knowledge to Improve Embedded Systems Dependability. In Proceedings of the Workshop on Hot Topics in System Dependability (HotDep 2010) Vancouver, Canada, October 2010 [BibTeX][PDF]@inproceedings { heinig:10:hotdep,
author = {Heinig, Andreas and Engel, Michael and Schmoll, Florian and Marwedel, Peter},
title = {Using Application Knowledge to Improve Embedded Systems Dependability},
booktitle = {Proceedings of the Workshop on Hot Topics in System Dependability (HotDep 2010)},
year = {2010},
address = {Vancouver, Canada},
month = {oct},
publisher = {USENIX Association},
keywords = {ders},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-heinig-hotdep.pdf},
confidential = {n},
} |
| Andreas Heinig, Michael Engel, Florian Schmoll and Peter Marwedel. Improving Transient Memory Fault Resilience of an H.264 Decoder. In Proceedings of the Workshop on Embedded Systems for Real-time Multimedia (ESTIMedia 2010) Scottsdale, AZ, USA, October 2010 [BibTeX][PDF]@inproceedings { heinig:10:estimedia,
author = {Heinig, Andreas and Engel, Michael and Schmoll, Florian and Marwedel, Peter},
title = {Improving Transient Memory Fault Resilience of an H.264 Decoder},
booktitle = {Proceedings of the Workshop on Embedded Systems for Real-time Multimedia (ESTIMedia 2010)},
year = {2010},
address = {Scottsdale, AZ, USA},
month = {oct},
publisher = {IEEE Computer Society Press},
keywords = {ders},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-heinig-estimedia.pdf},
confidential = {n},
} |
| Paul Lokuciejewski, Timon Kelter and Peter Marwedel. Superblock-Based Source Code Optimizations for WCET Reduction. In Proceedings of the 7th International Conference on Embedded Software and Systems (ICESS), pages 1918-1925 Bradford / UK, June 2010 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:10:icess,
author = {Lokuciejewski, Paul and Kelter, Timon and Marwedel, Peter},
title = {Superblock-Based Source Code Optimizations for WCET Reduction},
booktitle = {Proceedings of the 7th International Conference on Embedded Software and Systems (ICESS)},
year = {2010},
pages = {1918-1925},
address = {Bradford / UK},
month = {jun},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-icess.pdf},
confidential = {n},
abstract = {Superblocks represent regions in a program code that consist of multiple basic blocks. Compilers benefit from this structure since it enables optimization across block boundaries. This increased optimization potential was thoroughly studied in the past for average-case execution time (ACET) reduction at assembly level. In this paper, the concept of superblocks is exploited for the optimization of embedded real-time systems that have to meet stringent timing constraints specified by the worst-case execution time (WCET). To achieve this goal, our superblock formation is based on a novel trace selection algorithm which is driven by WCET data. Moreover, we translate superblocks for the first time from assembly to source code level. This approach enables an early code restructuring in the optimizer, providing more optimization opportunities for both subsequent source code and assembly level transformations. An adaption of the traditional optimizations common subexpression and dead code elimination to our WCET-aware superblocks allows an effective WCET reduction. Using our techniques, we significantly outperform standard optimizations and achieve an average WCET reduction of up to 10.2\% for a total of 55 real-life benchmarks.},
} Superblocks represent regions in a program code that consist of multiple basic blocks. Compilers benefit from this structure since it enables optimization across block boundaries. This increased optimization potential was thoroughly studied in the past for average-case execution time (ACET) reduction at assembly level. In this paper, the concept of superblocks is exploited for the optimization of embedded real-time systems that have to meet stringent timing constraints specified by the worst-case execution time (WCET). To achieve this goal, our superblock formation is based on a novel trace selection algorithm which is driven by WCET data. Moreover, we translate superblocks for the first time from assembly to source code level. This approach enables an early code restructuring in the optimizer, providing more optimization opportunities for both subsequent source code and assembly level transformations. An adaption of the traditional optimizations common subexpression and dead code elimination to our WCET-aware superblocks allows an effective WCET reduction. Using our techniques, we significantly outperform standard optimizations and achieve an average WCET reduction of up to 10.2% for a total of 55 real-life benchmarks.
|
| Paul Lokuciejewski, Sascha Plazar, Heiko Falk, Peter Marwedel and Lothar Thiele. Multi-Objective Exploration of Compiler Optimizations for Real-Time Systems. In Proceedings of the 13th International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC), pages 115-122 Carmona / Spain, May 2010 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:10:isorc,
author = {Lokuciejewski, Paul and Plazar, Sascha and Falk, Heiko and Marwedel, Peter and Thiele, Lothar},
title = {Multi-Objective Exploration of Compiler Optimizations for Real-Time Systems},
booktitle = {Proceedings of the 13th International Symposium on Object/Component/Service-oriented Real-time Distributed Computing (ISORC)},
year = {2010},
pages = {115-122},
address = {Carmona / Spain},
month = {may},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-isorc_2.pdf},
confidential = {n},
abstract = {With the growing complexity of embedded systems software, high code quality can only be achieved using a compiler. Sophisticated compilers provide a vast spectrum of various optimizations to improve code aggressively w.r.t. different objective functions, e.g., average-case execution time \textit{(ACET)} or code size. Due to the complex interactions between the optimizations, the choice for a promising sequence of code transformations is not trivial. Compiler developers address this problem by proposing standard optimization levels, e.g., \textit{O3} or \textit{Os}. However, previous studies have shown that these standard levels often miss optimization potential or might even result in performance degradation. In this paper, we propose the first adaptive WCET-aware compiler framework for an automatic search of compiler optimization sequences which yield highly optimized code. Besides the objective functions ACET and code size, we consider the worst-case execution time \textit{(WCET)} which is a crucial parameter for real-time systems. To find suitable trade-offs between these objectives, stochastic evolutionary multi-objective algorithms identifying Pareto optimal solutions are exploited. A comparison based on statistical performance assessments is performed which helps to determine the most suitable multi-objective optimizer. The effectiveness of our approach is demonstrated on real-life benchmarks showing that standard optimization levels can be significantly outperformed.},
} With the growing complexity of embedded systems software, high code quality can only be achieved using a compiler. Sophisticated compilers provide a vast spectrum of various optimizations to improve code aggressively w.r.t. different objective functions, e.g., average-case execution time (ACET) or code size. Due to the complex interactions between the optimizations, the choice for a promising sequence of code transformations is not trivial. Compiler developers address this problem by proposing standard optimization levels, e.g., O3 or Os. However, previous studies have shown that these standard levels often miss optimization potential or might even result in performance degradation. In this paper, we propose the first adaptive WCET-aware compiler framework for an automatic search of compiler optimization sequences which yield highly optimized code. Besides the objective functions ACET and code size, we consider the worst-case execution time (WCET) which is a crucial parameter for real-time systems. To find suitable trade-offs between these objectives, stochastic evolutionary multi-objective algorithms identifying Pareto optimal solutions are exploited. A comparison based on statistical performance assessments is performed which helps to determine the most suitable multi-objective optimizer. The effectiveness of our approach is demonstrated on real-life benchmarks showing that standard optimization levels can be significantly outperformed.
|
| Paul Lokuciejewski, Marco Stolpe, Katharina Morik and Peter Marwedel. Automatic Selection of Machine Learning Models for WCET-aware Compiler Heuristic Generation. In Proceedings of the 4th Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART), pages 3-17 Pisa / Italy, January 2010 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:10:smart,
author = {Lokuciejewski, Paul and Stolpe, Marco and Morik, Katharina and Marwedel, Peter},
title = {Automatic Selection of Machine Learning Models for WCET-aware Compiler Heuristic Generation},
booktitle = {Proceedings of the 4th Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART)},
year = {2010},
pages = {3--17},
address = {Pisa / Italy},
month = {jan},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2010-smart.pdf},
confidential = {n},
abstract = {Machine learning has shown its capabilities for an automatic generation of heuristics used by optimizing compilers. The advantages of these heuristics are that they can be easily adopted to a new environment and in some cases outperform hand-crafted compiler optimizations. However, this approach shifts the effort from manual heuristic tuning to the model selection problem of machine learning - i.e., selecting learning algorithms and their respective parameters - which is a tedious task in its own right. In this paper, we tackle the model selection problem in a systematic way. As our experiments show, the right choice of a learning algorithm and its parameters can significantly affect the quality of the generated heuristics. We present a generic framework integrating machine learning into a compiler to enable an automatic search for the best learning algorithm. To find good settings for the learner parameters within the large search space, optimizations based on evolutionary algorithms are applied. In contrast to the majority of other approaches aiming at a reduction of the average-case execution time (ACET), our goal is the minimization of the worst-case execution time (WCET) which is a key parameter for embedded systems acting as real-time systems. A careful case study on the heuristic generation for the well-known optimization loop invariant code motion shows the challenges and benefits of our methods.},
} Machine learning has shown its capabilities for an automatic generation of heuristics used by optimizing compilers. The advantages of these heuristics are that they can be easily adopted to a new environment and in some cases outperform hand-crafted compiler optimizations. However, this approach shifts the effort from manual heuristic tuning to the model selection problem of machine learning - i.e., selecting learning algorithms and their respective parameters - which is a tedious task in its own right. In this paper, we tackle the model selection problem in a systematic way. As our experiments show, the right choice of a learning algorithm and its parameters can significantly affect the quality of the generated heuristics. We present a generic framework integrating machine learning into a compiler to enable an automatic search for the best learning algorithm. To find good settings for the learner parameters within the large search space, optimizations based on evolutionary algorithms are applied. In contrast to the majority of other approaches aiming at a reduction of the average-case execution time (ACET), our goal is the minimization of the worst-case execution time (WCET) which is a key parameter for embedded systems acting as real-time systems. A careful case study on the heuristic generation for the well-known optimization loop invariant code motion shows the challenges and benefits of our methods.
|
| Constantin Timm, Jens Schmutzler, Peter Marwedel and Christian Wietfeld. Dynamic Web Service Orchestration applied to the Device Profile for Web Services in Hierarchical Networks. In COMSWARE '09: Proceedings of the Fourth International ICST Conference on COMmunication System softWAre and middlewaRE, pages 1 - 6 Dublin, Ireland, 06 2009 [BibTeX][Abstract]@inproceedings { 2009Timm,
author = {Timm, Constantin and Schmutzler, Jens and Marwedel, Peter and Wietfeld, Christian},
title = {Dynamic Web Service Orchestration applied to the Device Profile for Web Services in Hierarchical Networks},
booktitle = {COMSWARE '09: Proceedings of the Fourth International ICST Conference on COMmunication System softWAre and middlewaRE},
year = {2009},
pages = {1--6},
address = {Dublin, Ireland},
month = {jun},
confidential = {n},
abstract = {Based on the idea of Service Oriented Architectures (SOA), Web Services paved the way for open and flexible interaction between heterogeneous systems with a loose coupling between service endpoints. The Device Profile for Web Services (DPWS) implements a subset of WS-* specifications in order to make the advantages of the Web Service architecture available to a growing embedded systems market. In this paper we are proposing a service orchestration mechanism applied to services on top of a DPWS-based middleware. The approach is complementary to the rather complex and resource intensive Web Service Business Process Execution Language (WS-BPEL) and focuses on service orchestration on resource constrained devices deployed in hierarchical network topologies. We validate our service orchestration concept through its resource consumption and illustrate its seamless integration into the service development cycle based on the underlying DPWS-compliant middleware.},
} Based on the idea of Service Oriented Architectures (SOA), Web Services paved the way for open and flexible interaction between heterogeneous systems with a loose coupling between service endpoints. The Device Profile for Web Services (DPWS) implements a subset of WS-* specifications in order to make the advantages of the Web Service architecture available to a growing embedded systems market. In this paper we are proposing a service orchestration mechanism applied to services on top of a DPWS-based middleware. The approach is complementary to the rather complex and resource intensive Web Service Business Process Execution Language (WS-BPEL) and focuses on service orchestration on resource constrained devices deployed in hierarchical network topologies. We validate our service orchestration concept through its resource consumption and illustrate its seamless integration into the service development cycle based on the underlying DPWS-compliant middleware.
|
| Sascha Plazar, Paul Lokuciejewski and Peter Marwedel. WCET-aware Software Based Cache Partitioning for Multi-Task Real-Time Systems. In The 9th International Workshop on Worst-Case Execution Time Analysis (WCET), pages 78-88 Dublin / Ireland, June 2009 [BibTeX][PDF][Abstract]@inproceedings { plazar:09:wcet,
author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter},
title = {WCET-aware Software Based Cache Partitioning for Multi-Task Real-Time Systems},
booktitle = {The 9th International Workshop on Worst-Case Execution Time Analysis (WCET)},
year = {2009},
pages = {78--88},
address = {Dublin / Ireland},
month = {jun},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-wcet.pdf},
confidential = {n},
abstract = {Caches are a source of unpredictability since it is very difficult to predict if a memory access results in a cache hit or miss. In systems running multiple tasks steered by a preempting scheduler, it is even impossible to determine the cache behavior since interrupt-driven schedulers lead to unknown points of time for context switches. Partitioned caches are already used in multi-task environments to increase the cache hit ratio by avoiding mutual eviction of tasks from the cache. For real-time systems, the upper bound of the execution time is one of the most important metrics, called the Worst-Case Execution Time (WCET). In this paper, we use partitioning of instruction caches as a technique to achieve tighter WCET estimations since tasks can not be evicted from their partition by other tasks. We propose a novel WCET-aware algorithm, which determines the optimal partition size for each task with focus on decreasing the system's WCET for a given set of possible partition sizes. Employing this algorithm, we are able to decrease the WCET depending on the number of tasks in a set by up to 34\%. On average, reductions between 12\% and 19\% can be achieved.},
} Caches are a source of unpredictability since it is very difficult to predict if a memory access results in a cache hit or miss. In systems running multiple tasks steered by a preempting scheduler, it is even impossible to determine the cache behavior since interrupt-driven schedulers lead to unknown points of time for context switches. Partitioned caches are already used in multi-task environments to increase the cache hit ratio by avoiding mutual eviction of tasks from the cache. For real-time systems, the upper bound of the execution time is one of the most important metrics, called the Worst-Case Execution Time (WCET). In this paper, we use partitioning of instruction caches as a technique to achieve tighter WCET estimations since tasks can not be evicted from their partition by other tasks. We propose a novel WCET-aware algorithm, which determines the optimal partition size for each task with focus on decreasing the system's WCET for a given set of possible partition sizes. Employing this algorithm, we are able to decrease the WCET depending on the number of tasks in a set by up to 34%. On average, reductions between 12% and 19% can be achieved.
|
| G. Schuenemann, P. Hartmann, D. Schirmer, P. Towalski, T. Weis, K. Wille and P. Marwedel. An FPGA Based Data Acquisition System for a fast Orbit Feedback at DELTA. In 9th European Workshop on Beam Diagnostics and Instrumentation for Particle Accelerators Basel / Switzerland, May 2009 [BibTeX][PDF][Abstract]@inproceedings { marwedel:09:dipac,
author = {Schuenemann, G. and Hartmann, P. and Schirmer, D. and Towalski, P. and Weis, T. and Wille, K. and Marwedel, P.},
title = {An FPGA Based Data Acquisition System for a fast Orbit Feedback at DELTA},
booktitle = {9th European Workshop on Beam Diagnostics and Instrumentation for Particle Accelerators},
year = {2009},
address = {Basel / Switzerland},
month = {may},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-dipac.pdf},
confidential = {n},
abstract = {The demand for beam orbit stability for frequencies up to 1kHz resulted in the need for a fast orbit position data acquisition system at DELTA. The measurement frequency was decided to be 10kHz which results in a good margin for 1kHz corrections. It is based on a Xilinx University Program Virtex-II Pro Development System in conjunction with an inhouse developed Analog-Digital Converter board, featuring two Analog Devices AD974 chips. An inhouse developed software written in VHDL manages measurement and data pre-processing. A communication controller has been adopted from the Diamond Light Source and is used as communication instance. The communication controller is versatile in its application. The data distribution between two or more of the developed measuring systems is possible. This includes data distribution with other systems utilizing the communication controller, e.g. the Libera beam diagnostic system1. To enhance its measuring capabilities one of the two onboard PowerPC cores is running a Linux kernel. A kernel module, capable of receiving the measurement data from the Field Programmable Gate Array (FPGA) measurement core, was implemented, allowing for advanced data processing and distribution options. The paper presents the design of the system, the used methods and successful results of the first beam measurements.},
} The demand for beam orbit stability for frequencies up to 1kHz resulted in the need for a fast orbit position data acquisition system at DELTA. The measurement frequency was decided to be 10kHz which results in a good margin for 1kHz corrections. It is based on a Xilinx University Program Virtex-II Pro Development System in conjunction with an inhouse developed Analog-Digital Converter board, featuring two Analog Devices AD974 chips. An inhouse developed software written in VHDL manages measurement and data pre-processing. A communication controller has been adopted from the Diamond Light Source and is used as communication instance. The communication controller is versatile in its application. The data distribution between two or more of the developed measuring systems is possible. This includes data distribution with other systems utilizing the communication controller, e.g. the Libera beam diagnostic system1. To enhance its measuring capabilities one of the two onboard PowerPC cores is running a Linux kernel. A kernel module, capable of receiving the measurement data from the Field Programmable Gate Array (FPGA) measurement core, was implemented, allowing for advanced data processing and distribution options. The paper presents the design of the system, the used methods and successful results of the first beam measurements.
|
| Paul Lokuciejewski, Daniel Cordes, Heiko Falk and Peter Marwedel. A Fast and Precise Static Loop Analysis based on Abstract Interpretation, Program Slicing and Polytope Models. In International Symposium on Code Generation and Optimization (CGO), pages 136-146 Seattle / USA, March 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:cgo,
author = {Lokuciejewski, Paul and Cordes, Daniel and Falk, Heiko and Marwedel, Peter},
title = {A Fast and Precise Static Loop Analysis based on Abstract Interpretation, Program Slicing and Polytope Models},
booktitle = {International Symposium on Code Generation and Optimization (CGO)},
year = {2009},
pages = {136--146},
address = {Seattle / USA},
month = {mar},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-cgo.pdf},
confidential = {n},
abstract = {A static loop analysis is a program analysis computing loop iteration counts. This information is crucial for different fields of applications. In the domain of compilers, the knowledge about loop iterations can be exploited for aggressive loop optimizations like Loop Unrolling. A loop analyzer also provides static information about code execution frequencies which can assist feedback-directed optimizations. Another prominent application is the static worst-case execution time (WCET) analysis which relies on a safe approximation of loop iteration counts. In this paper, we propose a framework for a static loop analysis based on Abstract Interpretation, a theory of a sound approximation of program semantics. To accelerate the analysis, we preprocess the analyzed code using Program Slicing, a technique that removes statements irrelevant for the loop analysis. In addition, we introduce a novel polytope-based loop evaluation that further significantly reduces the analysis time. The efficiency of our loop analyzer is evaluated on a large number of benchmarks. Results show that 99\% of the considered loops could be successfully analyzed in an acceptable amount of time. This study points out that our methodology is best suited for real-world problems.},
} A static loop analysis is a program analysis computing loop iteration counts. This information is crucial for different fields of applications. In the domain of compilers, the knowledge about loop iterations can be exploited for aggressive loop optimizations like Loop Unrolling. A loop analyzer also provides static information about code execution frequencies which can assist feedback-directed optimizations. Another prominent application is the static worst-case execution time (WCET) analysis which relies on a safe approximation of loop iteration counts. In this paper, we propose a framework for a static loop analysis based on Abstract Interpretation, a theory of a sound approximation of program semantics. To accelerate the analysis, we preprocess the analyzed code using Program Slicing, a technique that removes statements irrelevant for the loop analysis. In addition, we introduce a novel polytope-based loop evaluation that further significantly reduces the analysis time. The efficiency of our loop analyzer is evaluated on a large number of benchmarks. Results show that 99% of the considered loops could be successfully analyzed in an acceptable amount of time. This study points out that our methodology is best suited for real-world problems.
|
| Paul Lokuciejewski and Peter Marwedel. Combining Worst-Case Timing Models, Loop Unrolling, and Static Loop Analysis for WCET Minimization. In The 21st Euromicro Conference on Real-Time Systems (ECRTS), pages 35-44 Dublin / Ireland, July 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:ecrts,
author = {Lokuciejewski, Paul and Marwedel, Peter},
title = {Combining Worst-Case Timing Models, Loop Unrolling, and Static Loop Analysis for WCET Minimization},
booktitle = {The 21st Euromicro Conference on Real-Time Systems (ECRTS)},
year = {2009},
pages = {35--44},
address = {Dublin / Ireland},
month = {jul},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-ecrts.pdf},
confidential = {n},
abstract = {Program loops are notorious for their optimization potential on modern high-performance architectures. Compilers aim at their aggressive transformation to achieve large improvements of the program performance. In particular, the optimization loop unrolling has shown in the past decades to be highly effective achieving significant increases of the average-case performance. In this paper, we present loop unrolling that is tailored towards real-time systems. Our novel optimization is driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. To exploit maximal optimization potential, the determination of a suitable unrolling factor is based on precise loop iteration counts provided by a static loop analysis. In addition, our heuristics avoid adverse effects of unrolling which result from instruction cache overflows and the generation of additional spill code. Results on 45 real-life benchmarks demonstrate that aggressive loop unrolling can yield WCET reductions of up to 13.7\% over simple, naive approaches employed by many production compilers.},
} Program loops are notorious for their optimization potential on modern high-performance architectures. Compilers aim at their aggressive transformation to achieve large improvements of the program performance. In particular, the optimization loop unrolling has shown in the past decades to be highly effective achieving significant increases of the average-case performance. In this paper, we present loop unrolling that is tailored towards real-time systems. Our novel optimization is driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. To exploit maximal optimization potential, the determination of a suitable unrolling factor is based on precise loop iteration counts provided by a static loop analysis. In addition, our heuristics avoid adverse effects of unrolling which result from instruction cache overflows and the generation of additional spill code. Results on 45 real-life benchmarks demonstrate that aggressive loop unrolling can yield WCET reductions of up to 13.7% over simple, naive approaches employed by many production compilers.
|
| Paul Lokuciejewski, Fatih Gedikli, Peter Marwedel and Katharina Morik. Automatic WCET Reduction by Machine Learning Based Heuristics for Function Inlining. In Proceedings of the 3rd Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART), pages 1-15 Paphos / Cyprus, January 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:smart,
author = {Lokuciejewski, Paul and Gedikli, Fatih and Marwedel, Peter and Morik, Katharina},
title = {Automatic WCET Reduction by Machine Learning Based Heuristics for Function Inlining},
booktitle = {Proceedings of the 3rd Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART)},
year = {2009},
pages = {1--15},
address = {Paphos / Cyprus},
month = {jan},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-smart.pdf},
confidential = {n},
abstract = {The application of machine learning techniques in compiler frameworks has become a challenging research area. Learning algorithms are exploited for an automatic generation of optimization heuristics which often outperform hand-crafted models. Moreover, these automatic approaches can effectively tune the compilers' heuristics after larger changes in the optimization sequence or they can be leveraged to tailor heuristics towards a particular architectural model. Previous works focussed on a reduction of the average-case performance. In this paper, learning approaches are studied in the context of an automatic minimization of the worst-case execution time (WCET) which is the upper bound of the program's maximum execution time. We show that explicitly taking the new timing model into account allows the construction of compiler heuristics that effectively reduce the WCET. This is demonstrated for the well-known optimization function inlining. Our WCET-driven inlining heuristics based on a fast classifier called random forests outperform standard heuristics by up to 9.1\% on average in terms of the WCET reduction. Moreover, we point out that our classifier is highly accurate with a prediction rate for inlining candidates of 84.0\%.},
} The application of machine learning techniques in compiler frameworks has become a challenging research area. Learning algorithms are exploited for an automatic generation of optimization heuristics which often outperform hand-crafted models. Moreover, these automatic approaches can effectively tune the compilers' heuristics after larger changes in the optimization sequence or they can be leveraged to tailor heuristics towards a particular architectural model. Previous works focussed on a reduction of the average-case performance. In this paper, learning approaches are studied in the context of an automatic minimization of the worst-case execution time (WCET) which is the upper bound of the program's maximum execution time. We show that explicitly taking the new timing model into account allows the construction of compiler heuristics that effectively reduce the WCET. This is demonstrated for the well-known optimization function inlining. Our WCET-driven inlining heuristics based on a fast classifier called random forests outperform standard heuristics by up to 9.1% on average in terms of the WCET reduction. Moreover, we point out that our classifier is highly accurate with a prediction rate for inlining candidates of 84.0%.
|
| Paul Lokuciejewski, Fatih Gedikli and Peter Marwedel. Accelerating WCET-driven Optimizations by the Invariant Path Paradigm - a Case Study of Loop Unswitching. In The 12th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 11-20 Nice / France, April 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:scopes,
author = {Lokuciejewski, Paul and Gedikli, Fatih and Marwedel, Peter},
title = {Accelerating WCET-driven Optimizations by the Invariant Path Paradigm - a Case Study of Loop Unswitching},
booktitle = {The 12th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
year = {2009},
pages = {11--20},
address = {Nice / France},
month = {apr},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-scopes.pdf},
confidential = {n},
abstract = {The worst-case execution time (WCET) being the upper bound of the maximum execution time corresponds to the longest path through the program's control flow graph. Its reduction is the objective of a WCET optimization. Unlike average-case execution time compiler optimizations which consider a static (most frequently executed) path, the longest path is variable since its optimization might result in another path becoming the effective longest path. To keep path information valid, WCET optimizations typically perform a time-consuming static WCET analysis after each code modification to ensure that subsequent optimization steps operate on the critical path. However, a code modification does not always lead to a path switch, making many WCET analyses superfluous. To cope with this problem, we propose a new paradigm called Invariant Path which eliminates the pessimism by indicating whether a path update is mandatory. To demonstrate the paradigm's practical use, we developed a novel optimization called WCET-driven Loop Unswitching which exploits the Invariant Path information. In a case study, our optimization reduced the WCET of real-world benchmarks by up to 18.3\%, while exploiting the Invariant Path paradigm led to a reduction of the optimization time by 57.5\% on average.},
} The worst-case execution time (WCET) being the upper bound of the maximum execution time corresponds to the longest path through the program's control flow graph. Its reduction is the objective of a WCET optimization. Unlike average-case execution time compiler optimizations which consider a static (most frequently executed) path, the longest path is variable since its optimization might result in another path becoming the effective longest path. To keep path information valid, WCET optimizations typically perform a time-consuming static WCET analysis after each code modification to ensure that subsequent optimization steps operate on the critical path. However, a code modification does not always lead to a path switch, making many WCET analyses superfluous. To cope with this problem, we propose a new paradigm called Invariant Path which eliminates the pessimism by indicating whether a path update is mandatory. To demonstrate the paradigm's practical use, we developed a novel optimization called WCET-driven Loop Unswitching which exploits the Invariant Path information. In a case study, our optimization reduced the WCET of real-world benchmarks by up to 18.3%, while exploiting the Invariant Path paradigm led to a reduction of the optimization time by 57.5% on average.
|
| Paul Lokuciejewski, Heiko Falk and Peter Marwedel. WCET-driven Cache-based Procedure Positioning Optimizations. In The 20th Euromicro Conference on Real-Time Systems (ECRTS), pages 321-330 Prague / Czech Republic, July 2008 [BibTeX][PDF][Abstract]@inproceedings { loku:08:ecrts,
author = {Lokuciejewski, Paul and Falk, Heiko and Marwedel, Peter},
title = {WCET-driven Cache-based Procedure Positioning Optimizations},
booktitle = {The 20th Euromicro Conference on Real-Time Systems (ECRTS)},
year = {2008},
pages = {321--330},
address = {Prague / Czech Republic},
month = {jul},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-ecrts.pdf},
confidential = {n},
abstract = {Procedure Positioning is a well known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of procedures calling each other frequently in the memory avoids overlapping of cache lines and thus decreases the number of cache conflict misses. In standard literature, these positioning techniques are guided by execution profile data and focus on an improved average-case performance. We present two novel positioning optimizations driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. WCET reductions by 10\% on average are achieved. Moreover, a combination of positioning and the WCET-driven Procedure Cloning optimization is presented improving the WCET analysis by 36\% on average.},
} Procedure Positioning is a well known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of procedures calling each other frequently in the memory avoids overlapping of cache lines and thus decreases the number of cache conflict misses. In standard literature, these positioning techniques are guided by execution profile data and focus on an improved average-case performance. We present two novel positioning optimizations driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. WCET reductions by 10% on average are achieved. Moreover, a combination of positioning and the WCET-driven Procedure Cloning optimization is presented improving the WCET analysis by 36% on average.
|
| Paul Lokuciejewski, Heiko Falk, Peter Marwedel and Henrik Theiling. WCET-Driven, Code-Size Critical Procedure Cloning. In The 11th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 21-30 Munich / Germany, March 2008 [BibTeX][PDF][Abstract]@inproceedings { loku:08:scopes,
author = {Lokuciejewski, Paul and Falk, Heiko and Marwedel, Peter and Theiling, Henrik},
title = {WCET-Driven, Code-Size Critical Procedure Cloning},
booktitle = {The 11th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
year = {2008},
pages = {21--30},
address = {Munich / Germany},
month = {mar},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-scopes.pdf},
confidential = {n},
abstract = {In the domain of the worst-case execution time (WCET) analysis, loops are an inherent source of unpredictability and loss of precision since the determination of tight and safe information on the number of loop iterations is a difficult task. In particular, data-dependent loops whose iteration counts depend on function parameters can not be precisely handled by a timing analysis. Procedure Cloning can be exploited to make these loops explicit within the source code allowing a highly precise WCET analysis. In this paper we extend the standard Procedure Cloning optimization by WCET-aware concepts with the objective to improve the tightness of the WCET estimation. Our novel approach is driven by WCET information which successively eliminates code structures leading to overestimated timing results, thus making the code more suitable for the analysis. In addition, the code size increase during the optimization is monitored and large increases are avoided. The effectiveness of our optimization is shown by tests on real-world benchmarks. After performing our optimization, the estimated WCET is reduced by up to 64.2\% while the employed code transformations yield an additional code size increase of 22.6\% on average. In contrast, the average-case performance being the original objective of Procedure Cloning showed a slight decrease.},
} In the domain of the worst-case execution time (WCET) analysis, loops are an inherent source of unpredictability and loss of precision since the determination of tight and safe information on the number of loop iterations is a difficult task. In particular, data-dependent loops whose iteration counts depend on function parameters can not be precisely handled by a timing analysis. Procedure Cloning can be exploited to make these loops explicit within the source code allowing a highly precise WCET analysis. In this paper we extend the standard Procedure Cloning optimization by WCET-aware concepts with the objective to improve the tightness of the WCET estimation. Our novel approach is driven by WCET information which successively eliminates code structures leading to overestimated timing results, thus making the code more suitable for the analysis. In addition, the code size increase during the optimization is monitored and large increases are avoided. The effectiveness of our optimization is shown by tests on real-world benchmarks. After performing our optimization, the estimated WCET is reduced by up to 64.2% while the employed code transformations yield an additional code size increase of 22.6% on average. In contrast, the average-case performance being the original objective of Procedure Cloning showed a slight decrease.
|
| Sascha Plazar, Paul Lokuciejewski and Peter Marwedel. A Retargetable Framework for Multi-objective WCET-aware High-level Compiler Optimizations. In Proceedings of The 29th IEEE Real-Time Systems Symposium (RTSS) WiP, pages 49-52 Barcelona / Spain, December 2008 [BibTeX][PDF][Abstract]@inproceedings { plazar:08:rtss,
author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter},
title = {A Retargetable Framework for Multi-objective WCET-aware High-level Compiler Optimizations},
booktitle = {Proceedings of The 29th IEEE Real-Time Systems Symposium (RTSS) WiP},
year = {2008},
pages = {49--52},
address = {Barcelona / Spain},
month = {dec},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-rtss.pdf},
confidential = {n},
abstract = {The worst-case execution time (WCET) is a key parameter in the domain of real-time systems and its automatic compiler-based minimization becomes a challenging research area. Although today's embedded system applications are written in a high-level language, most published works consider low-level optimizations which complicate their portability to other processors. In this work, we present a framework for the development of novel WCET-driven high-level optimizations. Our WCET-aware compiler framework provides a multi-target support as well as an integration of different non-functional objectives. It enables multi-objective optimizations, thus opens avenues to a state-of-the-art design of predictable and efficient systems. In addition, the multi-target support provides the opportunity to efficiently evaluate the impact of different compiler optimizations on various processors.},
} The worst-case execution time (WCET) is a key parameter in the domain of real-time systems and its automatic compiler-based minimization becomes a challenging research area. Although today's embedded system applications are written in a high-level language, most published works consider low-level optimizations which complicate their portability to other processors. In this work, we present a framework for the development of novel WCET-driven high-level optimizations. Our WCET-aware compiler framework provides a multi-target support as well as an integration of different non-functional objectives. It enables multi-objective optimizations, thus opens avenues to a state-of-the-art design of predictable and efficient systems. In addition, the multi-target support provides the opportunity to efficiently evaluate the impact of different compiler optimizations on various processors.
|
| Peter Marwedel and Heiko Falk (presentation). Memory-architecture aware compilation. In The ARTIST2 Summer School 2008 in Europe Autrans / France, 2008 [BibTeX][PDF]@inproceedings { marwedel:08:artist2,
author = {Marwedel, Peter and Falk (presentation), Heiko},
title = {Memory-architecture aware compilation},
booktitle = {The ARTIST2 Summer School 2008 in Europe},
year = {2008},
address = {Autrans / France},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-artist2summerschool.pdf},
internal-note = {NOTE(review): "(presentation)" is embedded in the author field and is parsed by BibTeX as part of the surname; consider author = Falk, Heiko plus a separate note field -- confirm intended rendering before changing},
confidential = {n},
} |
| Paul Lokuciejewski, Heiko Falk, Martin Schwarzer and Peter Marwedel. Tighter WCET Estimates by Procedure Cloning. In 7th International Workshop on Worst-Case Execution Time Analysis (WCET), pages 27-32 Pisa/Italy, July 2007 [BibTeX][PDF][Abstract]@inproceedings { loku:07:wcet,
author = {Lokuciejewski, Paul and Falk, Heiko and Schwarzer, Martin and Marwedel, Peter},
title = {Tighter WCET Estimates by Procedure Cloning},
booktitle = {7th International Workshop on Worst-Case Execution Time Analysis (WCET)},
year = {2007},
pages = {27--32},
address = {Pisa/Italy},
month = jul,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-wcet.pdf},
confidential = {n},
abstract = {Embedded software spends most of its execution time in loops. To allow a precise static WCET analysis, each loop iteration should, in theory, be represented by an individual calling context. However, due to the enormous analysis times of real-world applications, this approach is not feasible and requires a reduction of the analysis complexity by limiting the number of considered contexts. This restricted timing analysis results in imprecise WCET estimates. In particular, data-dependent loops with iteration counts depending on function parameters cannot be precisely analyzed. In order to reduce the number of contexts that must be implicitly considered, causing an increase in analysis time, we apply the standard compiler optimization \emph{procedure cloning} which improves the program's predictability by making loops explicit and thus allowing a precise annotation of loop bounds. The result is a tight WCET estimation within a reduced analysis time. Our results indicate that reductions of the WCET between 12\% and 95\% were achieved for real-world benchmarks. In contrast, the reduction of the simulated program execution time remained marginal with only 3\%. As will be also shown, this optimization only produces a small overhead for the WCET analysis.},
} Embedded software spends most of its execution time in loops. To allow a precise static WCET analysis, each loop iteration should, in theory, be represented by an individual calling context. However, due to the enormous analysis times of real-world applications, this approach is not feasible and requires a reduction of the analysis complexity by limiting the number of considered contexts. This restricted timing analysis results in imprecise WCET estimates. In particular, data-dependent loops with iteration counts depending on function parameters cannot be precisely analyzed. In order to reduce the number of contexts that must be implicitly considered, causing an increase in analysis time, we apply the standard compiler optimization procedure cloning which improves the program's predictability by making loops explicit and thus allowing a precise annotation of loop bounds. The result is a tight WCET estimation within a reduced analysis time. Our results indicate that reductions of the WCET between 12% and 95% were achieved for real-world benchmarks. In contrast, the reduction of the simulated program execution time remained marginal with only 3%. As will be also shown, this optimization only produces a small overhead for the WCET analysis.
|
| Robert Pyka, Christoph Faßbach, Manish Verma, Heiko Falk and Peter Marwedel. Operating system integrated energy aware scratchpad allocation strategies for multiprocess applications. In 10th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 41-50 Nice/France, April 2007 [BibTeX][PDF][Abstract]@inproceedings { pyka:07:scopes,
author = {Pyka, Robert and Fa{\ss}bach, Christoph and Verma, Manish and Falk, Heiko and Marwedel, Peter},
title = {Operating system integrated energy aware scratchpad allocation strategies for multiprocess applications},
booktitle = {10th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
year = {2007},
pages = {41--50},
address = {Nice/France},
month = apr,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-scopes.pdf},
confidential = {n},
abstract = {Various scratchpad allocation strategies have been developed in the past. Most of them target the reduction of energy consumption. These approaches share the necessity of having direct access to the scratchpad memory. In earlier embedded systems this was always true, but with the increasing complexity of tasks systems have to perform, an additional operating system layer between the hardware and the application is becoming mandatory. This paper presents an approach to integrate a scratchpad memory manager into the operating system. The goal is to minimize energy consumption. In contrast to previous work, compile time knowledge about the application's behavior is taken into account. A set of fast heuristic allocation methods is proposed in this paper. An in-depth study and comparison of achieved energy savings and cycle reductions was performed. The results show that even in the highly dynamic environment of an operating system equipped embedded system, up to 83\% energy consumption reduction can be achieved.},
} Various scratchpad allocation strategies have been developed in the past. Most of them target the reduction of energy consumption. These approaches share the necessity of having direct access to the scratchpad memory. In earlier embedded systems this was always true, but with the increasing complexity of tasks systems have to perform, an additional operating system layer between the hardware and the application is becoming mandatory. This paper presents an approach to integrate a scratchpad memory manager into the operating system. The goal is to minimize energy consumption. In contrast to previous work, compile time knowledge about the application's behavior is taken into account. A set of fast heuristic allocation methods is proposed in this paper. An in-depth study and comparison of achieved energy savings and cycle reductions was performed. The results show that even in the highly dynamic environment of an operating system equipped embedded system, up to 83% energy consumption reduction can be achieved.
|
| Paul Lokuciejewski, Heiko Falk, Martin Schwarzer, Peter Marwedel and Henrik Theiling. Influence of Procedure Cloning on WCET Prediction. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), pages 137-142 Salzburg/Austria, September 2007 [BibTeX][PDF][Abstract]@inproceedings { loku:07:codes_isss,
author = {Lokuciejewski, Paul and Falk, Heiko and Schwarzer, Martin and Marwedel, Peter and Theiling, Henrik},
title = {Influence of Procedure Cloning on WCET Prediction},
booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
year = {2007},
pages = {137--142},
address = {Salzburg/Austria},
month = sep,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-codes+isss_2.pdf},
confidential = {n},
abstract = {For the worst-case execution time \emph{(WCET)} analysis, especially loops are an inherent source of unpredictability and loss of precision. This is caused by the difficulty to obtain safe and tight information on the number of iterations executed by a loop in the worst case. In particular, data-dependent loops whose iteration counts depend on function parameters are extremely difficult to analyze precisely. Procedure cloning helps by making such data-dependent loops explicit within the source code, thus making them accessible for high-precision WCET analyses. This paper presents the effect of procedure cloning applied at the source-code level on worst-case execution time. The optimization generates specialized versions of functions being called with constant values as arguments. In standard literature, it is used to enable further optimizations like constant propagation within functions and to reduce calling overhead. We show that procedure cloning for WCET minimization leads to significant improvements. Reductions of the WCET from 12\% up to 95\% were measured for real-life benchmarks. These results demonstrate that procedure cloning improves analyzability and predictability of real-time applications dramatically. In contrast, average-case performance as the criterion procedure cloning was developed for is reduced by only 3\% at most. Our results also show that these WCET reductions only implied small overhead during WCET analysis.},
} For the worst-case execution time (WCET) analysis, especially loops are an inherent source of unpredictability and loss of precision. This is caused by the difficulty to obtain safe and tight information on the number of iterations executed by a loop in the worst case. In particular, data-dependent loops whose iteration counts depend on function parameters are extremely difficult to analyze precisely. Procedure cloning helps by making such data-dependent loops explicit within the source code, thus making them accessible for high-precision WCET analyses. This paper presents the effect of procedure cloning applied at the source-code level on worst-case execution time. The optimization generates specialized versions of functions being called with constant values as arguments. In standard literature, it is used to enable further optimizations like constant propagation within functions and to reduce calling overhead. We show that procedure cloning for WCET minimization leads to significant improvements. Reductions of the WCET from 12% up to 95% were measured for real-life benchmarks. These results demonstrate that procedure cloning improves analyzability and predictability of real-time applications dramatically. In contrast, average-case performance as the criterion procedure cloning was developed for is reduced by only 3% at most. Our results also show that these WCET reductions only implied small overhead during WCET analysis.
|
| Peter Marwedel, Heiko Falk, Sascha Plazar, Robert Pyka and Lars Wehmeyer. Automatic mapping to tightly-coupled memories and cache locking. In Proceedings of 4th HiPEAC Industrial Workshop on Compilers and Architectures Cambridge, UK, August 2007 [BibTeX][PDF][Link]@inproceedings { marwedel:07:hipeac,
author = {Marwedel, Peter and Falk, Heiko and Plazar, Sascha and Pyka, Robert and Wehmeyer, Lars},
title = {Automatic mapping to tightly-coupled memories and cache locking},
booktitle = {Proceedings of 4th HiPEAC Industrial Workshop on Compilers and Architectures},
year = {2007},
address = {Cambridge, UK},
month = aug,
url = {http://www.hipeac.net/industry_workshop4},
keywords = {wcet},
file = {http://www.hipeac.net/system/files?file=session1_3.ppt},
confidential = {n},
} |
| Manish Verma and Peter Marwedel. Compilation and Simulation Tool Chain for Memory Aware Energy Optimizations. In Workshop on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS VI) Samos, Greece, July 2006 [BibTeX][PDF][Abstract]@inproceedings { verma:06:samos,
author = {Verma, Manish and Marwedel, Peter},
title = {Compilation and Simulation Tool Chain for Memory Aware Energy Optimizations},
booktitle = {Workshop on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS VI)},
year = {2006},
address = {Samos, Greece},
month = jul,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-samos.pdf},
confidential = {n},
abstract = {Memory hierarchies are known to be the energy bottleneck of portable embedded devices. Numerous memory aware energy optimizations have been proposed. However, both the optimization and the validation is performed in an ad-hoc manner as a coherent compilation and simulation framework does not exist as yet. In this paper, we present such a framework for performing memory hierarchy aware energy optimization. Both the compiler and the simulator are configured from a single memory hierarchy description. Significant savings of up to 50\% in the total energy dissipation are reported.},
} Memory hierarchies are known to be the energy bottleneck of portable embedded devices. Numerous memory aware energy optimizations have been proposed. However, both the optimization and the validation is performed in an ad-hoc manner as a coherent compilation and simulation framework does not exist as yet. In this paper, we present such a framework for performing memory hierarchy aware energy optimization. Both the compiler and the simulator are configured from a single memory hierarchy description. Significant savings of up to 50% in the total energy dissipation are reported.
|
| Lars Wehmeyer and Peter Marwedel. Influence of Memory Hierarchies on Predictability for Time Constrained Embedded Software. In Design Automation and Test in Europe (DATE) Munich, Germany, March 2005 [BibTeX][PDF][Abstract]@inproceedings { wehm:05:date,
author = {Wehmeyer, Lars and Marwedel, Peter},
title = {Influence of Memory Hierarchies on Predictability for Time Constrained Embedded Software},
booktitle = {Design Automation and Test in Europe (DATE)},
year = {2005},
address = {Munich, Germany},
month = mar,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-date.pdf},
confidential = {n},
abstract = {Safety-critical embedded systems having to meet real-time constraints are expected to be highly predictable in order to guarantee at design time that certain timing deadlines will always be met. This requirement usually prevents designers from utilizing caches due to their highly dynamic, thus hardly predictable behavior. The integration of scratchpad memories represents an alternative approach which allows the system to benefit from a performance gain comparable to that of caches while at the same time maintaining predictability. In this work, we compare the impact of scratchpad memories and caches on worst case execution time (WCET) analysis results. We show that caches, despite requiring complex techniques, can have a negative impact on the predicted WCET, while the estimated WCET for scratchpad memories scales with the achieved performance gain at no extra analysis cost.},
} Safety-critical embedded systems having to meet real-time constraints are expected to be highly predictable in order to guarantee at design time that certain timing deadlines will always be met. This requirement usually prevents designers from utilizing caches due to their highly dynamic, thus hardly predictable behavior. The integration of scratchpad memories represents an alternative approach which allows the system to benefit from a performance gain comparable to that of caches while at the same time maintaining predictability. In this work, we compare the impact of scratchpad memories and caches on worst case execution time (WCET) analysis results. We show that caches, despite requiring complex techniques, can have a negative impact on the predicted WCET, while the estimated WCET for scratchpad memories scales with the achieved performance gain at no extra analysis cost.
|
| Peter Marwedel, Manish Verma and Lars Wehmeyer. Compiler optimizations improving the processor/memory interface. In Workshop on Optimizing Compiler Assisted SoC Assembly (OCASA) September 2005 [BibTeX]@inproceedings { marw:05:ocasa,
author = {Marwedel, Peter and Verma, Manish and Wehmeyer, Lars},
title = {Compiler optimizations improving the processor/memory interface},
booktitle = {Workshop on Optimizing Compiler Assisted SoC Assembly (OCASA)},
year = {2005},
month = sep,
confidential = {n},
} |
| Manish Verma and Peter Marwedel. Memory Optimization Techniques for Low-Power Embedded Processors. In IFIP VIVA Workshop - Fundamentals and Methods for Low-Power Information Processing Bonn, Germany, September 2005 [BibTeX][PDF][Abstract]@inproceedings { verma:05:viva,
author = {Verma, Manish and Marwedel, Peter},
title = {Memory Optimization Techniques for Low-Power Embedded Processors},
booktitle = {IFIP VIVA Workshop - Fundamentals and Methods for Low-Power Information Processing},
year = {2005},
address = {Bonn, Germany},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-viva.pdf},
confidential = {n},
abstract = {Power consumption is an important design issue for contemporary portable embedded devices. It is known that the next generation of portable devices will feature faster processors and larger memories, both of which require high operational power. Memory subsystem has already been identified as the energy bottleneck of the entire system. Consequently, memory hierarchies are being constructed to reduce the memory subsystem's energy dissipation. Caches and scratchpad memories represent two contrasting memory architectures. Scratchpads are both area and power efficient than caches. However, they require explicit support from the compiler for managing their contents. In this work, we present three approaches for the prudent utilization of the scratchpad memory of an ARM7 processor and of a M5 DSP based system. The first approach is based on the following observations. Firstly, a small memory requires less energy per access than that by a large memory. Secondly, applications in general consist of small and frequently accessed arrays and large but infrequently accessed arrays. Consequently, the approach partitions the large scratchpad into several small scratchpads. The arrays are also statically mapped such that the small arrays are mapped to small and energy efficient scratchpads. The approach leads to average energy savings of 52\% and 35\% in the data memory subsystem of the ARM7 and the M5 DSP, respectively. The second approach utilizes the scratchpad as an instruction buffer in a cache based memory hierarchy. The approach models the cache as a conflict graph and assigns instructions to the scratchpad. The objective is to minimize the energy consumption of the system while preserving the predictable behavior of the memory hierarchy. The approach results in an average energy saving of 21\% against the above approach for the ARM7 based system. 
The last approach optimizes the energy consumption of the system by overlaying memory objects (\emph{i.e.} code segments and data elements) on to the scratchpad. Memory objects with non-conflicting life-times are assigned to the same location on the scratchpad. This improves the scratchpad utilization, however, it requires copying memory objects on and off the scratchpad during the execution of the application. Average energy reductions of 34\% and 33\% are reported for the ARM7 and the M5 DSP based systems, respectively.},
} Power consumption is an important design issue for contemporary portable embedded devices. It is known that the next generation of portable devices will feature faster processors and larger memories, both of which require high operational power. %This is expected to Memory subsystem has already been identified as the energy bottleneck of the entire system. Consequently, memory hierarchies are being constructed to reduce the memory subsystem's energy dissipation. Caches and scratchpad memories represent two contrasting memory architectures. Scratchpads are both area and power efficient than caches. However, they require explicit support from the compiler for managing their contents. In this work, we present three approaches for the prudent utilization of the scratchpad memory of an ARM7 processor and of a M5 DSP based system. The first approach is based on the following observations. Firstly, a small memory requires less energy per access than that by a large memory. Secondly, applications in general consist of small and frequently accessed arrays and large but infrequently accessed arrays. Consequently, the approach partitions the large scratchpad into several small scratchpads. The arrays are also statically mapped such that the small arrays are mapped to small and energy efficient scratchpads. The approach leads to average energy savings of 52% and 35% in the data memory subsystem of the ARM7 and the M5 DSP, respectively. The second approach utilizes the scratchpad as an instruction buffer in a cache based memory hierarchy. The approach models the cache as a conflict graph and assigns instructions to the scratchpad. The objective is to minimize the energy consumption of the system while preserving the predictable behavior of the memory hierarchy. The approach results in an average energy saving of 21% against the above approach for the ARM7 based system. The last approach optimizes the energy consumption of the system by overlaying memory objects (i.e. 
code segments and data elements) on to the scratchpad. Memory objects with non-conflicting life-times are assigned to the same location on the scratchpad. This improves the scratchpad utilization, however, it requires copying memory objects on and off the scratchpad during the execution of the application. Average energy reductions of 34% and 33% are reported for the ARM7 and the M5 DSP based systems, respectively.
|
| Peter Marwedel. Towards laying common grounds for embedded system design education. In Workshop on Embedded Systems Education (WESE) 2005 [BibTeX][PDF][Abstract]@inproceedings { marwedel:05:wese,
author = {Marwedel, Peter},
title = {Towards laying common grounds for embedded system design education},
booktitle = {Workshop on Embedded Systems Education (WESE)},
year = {2005},
internal-note = {NOTE(review): no pages, address or month recorded for this entry -- confirm against the WESE 2005 proceedings},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-wese.pdf},
confidential = {n},
abstract = {In this paper, we propose to introduce a common introductory course for embedded system education. The course puts the different areas of embedded system design into perspective and avoids an early over-specialization. Also, it motivates the students for attending more advanced theoretical courses. The content, the structure and the prerequisites of such a course are outlined. The course requires a basic understanding of computer hardware and software and can typically be taught in the second or third year.},
} In this paper, we propose to introduce a common introductory course for embedded system education. The course puts the different areas of embedded system design into perspective and avoids an early over-specialization. Also, it motivates the students for attending more advanced theoretical courses. The content, the structure and the prerequisites of such a course are outlined. The course requires a basic understanding of computer hardware and software and can typically be taught in the second or third year.
|
| Manish Verma, Klaus Petzold, Lars Wehmeyer, Heiko Falk and Peter Marwedel. Scratchpad Sharing Strategies for Multiprocess Embedded Systems: A First Approach. In IEEE 3rd Workshop on Embedded System for Real-Time Multimedia (ESTIMedia), pages 115-120 Jersey City, USA, September 2005 [BibTeX][PDF][Abstract]@inproceedings { verma:05:estimedia,
author = {Verma, Manish and Petzold, Klaus and Wehmeyer, Lars and Falk, Heiko and Marwedel, Peter},
title = {Scratchpad Sharing Strategies for Multiprocess Embedded Systems: A First Approach},
booktitle = {IEEE 3rd Workshop on Embedded System for Real-Time Multimedia (ESTIMedia)},
year = {2005},
pages = {115--120},
address = {Jersey City, USA},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-estimedia.pdf},
confidential = {n},
abstract = {Portable embedded systems require diligence in managing their energy consumption. Thus, power efficient processors coupled with onchip memories (e.g. caches, scratchpads) are the base of today's portable devices. Scratchpads are more energy efficient than caches but require software support for their utilization. Portable devices' applications consist of multiple processes for different tasks. However, all the previous scratchpad allocation approaches only consider single process applications. In this paper, we propose a set of optimal strategies to reduce the energy consumption of applications by sharing the scratchpad among multiple processes. The strategies assign both code and data elements to the scratchpad and result in average total energy reductions of 9\%-20\% against a published single process approach. Furthermore, the strategies generate Pareto-optimal curves for the applications allowing design time exploration of energy/scratchpad size tradeoffs.},
} Portable embedded systems require diligence in managing their energy consumption. Thus, power efficient processors coupled with onchip memories (e.g. caches, scratchpads) are the base of today's portable devices. Scratchpads are more energy efficient than caches but require software support for their utilization. Portable devices' applications consist of multiple processes for different tasks. However, all the previous scratchpad allocation approaches only consider single process applications. In this paper, we propose a set of optimal strategies to reduce the energy consumption of applications by sharing the scratchpad among multiple processes. The strategies assign both code and data elements to the scratchpad and result in average total energy reductions of 9%-20% against a published single process approach. Furthermore, the strategies generate Pareto-optimal curves for the applications allowing design time exploration of energy/scratchpad size tradeoffs.
|
| Lars Wehmeyer and Peter Marwedel. Influence of Onchip Scratchpad Memories on WCET prediction. In Proceedings of the 4th International Workshop on Worst-Case Execution Time (WCET) Analysis Catania, Sicily, Italy, June 2004 [BibTeX][PDF][Abstract]@inproceedings { wehm:04:wcet,
author = {Wehmeyer, Lars and Marwedel, Peter},
title = {Influence of Onchip Scratchpad Memories on WCET prediction},
booktitle = {Proceedings of the 4th International Workshop on Worst-Case Execution Time (WCET) Analysis},
year = {2004},
address = {Catania, Sicily, Italy},
month = jun,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-WCET.pdf},
confidential = {n},
abstract = {In contrast to standard PCs and many high-performance computer systems, systems that have to meet real-time requirements usually do not feature caches, since caches primarily improve the average case performance, whereas their impact on WCET is generally hard to predict. Especially in embedded systems, scratchpad memories have become popular. Since these small, fast memories can be controlled by the programmer or the compiler, their behavior is perfectly predictable. In this paper, we study for the first time the impact of scratchpad memories on worst case execution time (WCET) prediction. Our results indicate that scratchpads can significantly improve WCET at no extra analysis cost.},
} In contrast to standard PCs and many high-performance computer systems, systems that have to meet real-time requirements usually do not feature caches, since caches primarily improve the average case performance, whereas their impact on WCET is generally hard to predict. Especially in embedded systems, scratchpad memories have become popular. Since these small, fast memories can be controlled by the programmer or the compiler, their behavior is perfectly predictable. In this paper, we study for the first time the impact of scratchpad memories on worst case execution time (WCET) prediction. Our results indicate that scratchpads can significantly improve WCET at no extra analysis cost.
|
| Lars Wehmeyer, Urs Helmig and Peter Marwedel. Compiler-optimized Usage of Partitioned Memories. In Proceedings of the 3rd Workshop on Memory Performance Issues (WMPI2004) Munich, Germany, June 2004 [BibTeX][PDF][Abstract]@inproceedings { wehm:04:wmpi,
author = {Wehmeyer, Lars and Helmig, Urs and Marwedel, Peter},
title = {Compiler-optimized Usage of Partitioned Memories},
booktitle = {Proceedings of the 3rd Workshop on Memory Performance Issues (WMPI2004)},
year = {2004},
address = {Munich, Germany},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-WMPI.pdf},
confidential = {n},
abstract = {In order to meet the requirements concerning both performance and energy consumption in embedded systems, new memory architectures are being introduced. Beside the well-known use of caches in the memory hierarchy, processor cores today also include small onchip memories called scratchpad memories whose usage is not controlled by hardware, but rather by the programmer or the compiler. Techniques for utilization of these scratchpads have been known for some time. Some new processors provide more than one scratchpad, making it necessary to enhance the workflow such that this complex memory architecture can be efficiently utilized. In this work, we present an energy model and an ILP formulation to optimally assign memory objects to different partitions of scratchpad memories at compile time, achieving energy savings of up to 22\% compared to previous approaches.},
} In order to meet the requirements concerning both performance and energy consumption in embedded systems, new memory architectures are being introduced. Beside the well-known use of caches in the memory hierarchy, processor cores today also include small onchip memories called scratchpad memories whose usage is not controlled by hardware, but rather by the programmer or the compiler. Techniques for utilization of these scratchpads have been known for some time. Some new processors provide more than one scratchpad, making it necessary to enhance the workflow such that this complex memory architecture can be efficiently utilized. In this work, we present an energy model and an ILP formulation to optimally assign memory objects to different partitions of scratchpad memories at compile time, achieving energy savings of up to 22% compared to previous approaches.
|
| Manish Verma, Lars Wehmeyer and Peter Marwedel. Cache Aware Scratchpad Allocation. In DATE Paris/France, February 2004 [BibTeX][PDF][Abstract]@inproceedings { verma:04:date,
author = {Verma, Manish and Wehmeyer, Lars and Marwedel, Peter},
title = {Cache Aware Scratchpad Allocation},
booktitle = {DATE},
year = {2004},
address = {Paris/France},
month = feb,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-date.pdf},
confidential = {n},
abstract = {In the context of portable embedded systems, reducing energy is one of the prime objectives. Most high-end embedded microprocessors include onchip instruction and data caches, along with a small energy efficient scratchpad. Previous approaches for utilizing scratchpad did not consider caches and hence fail for the au courant architecture. In the presented work, we use the scratchpad for storing instructions and propose a generic Cache Aware Scratchpad Allocation (CASA) algorithm. We report an average reduction of 8-29\% in instruction memory energy consumption compared to a previously published technique for benchmarks from the Mediabench suite. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against preloaded loop caches, we report average energy savings of 20-44\%.},
} In the context of portable embedded systems, reducing energy is one of the prime objectives. Most high-end embedded microprocessors include onchip instruction and data caches, along with a small energy efficient scratchpad. Previous approaches for utilizing scratchpad did not consider caches and hence fail for the au courant architecture. In the presented work, we use the scratchpad for storing instructions and propose a generic Cache Aware Scratchpad Allocation (CASA) algorithm. We report an average reduction of 8-29% in instruction memory energy consumption compared to a previously published technique for benchmarks from the Mediabench suite. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against preloaded loop caches, we report average energy savings of 20-44%.
|
| Markus Lorenz and Peter Marwedel. Phase Coupled Code Generation for DSPs Using a Genetic Algorithm. In DATE, pages 1270-1275 June 2004 [BibTeX][PDF][Abstract]@inproceedings { lorenz:04:date,
author = {Lorenz, Markus and Marwedel, Peter},
title = {Phase Coupled Code Generation for {DSPs} Using a Genetic Algorithm},
booktitle = {DATE},
year = {2004},
pages = {1270--1275},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-date-lorenz.pdf},
confidential = {n},
abstract = {The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting special hardware features. Due to the irregular architectures present in todays DSPs there is a need of compilers which are capable of performing a phase coupling of the highly interdependent code generation subtasks and a graph based code selection. In this paper we present a code generator which performs a graph based code selection and a complete phase coupling of code selection, instruction scheduling (including compaction) and register allocation. In addition, our code generator takes into account effects of the subsequent address code generation phase. In order to solve the phase coupling problem and to handle the problem complexity, our code generator is based on a genetic algorithm. Experimental results for several benchmarks and an MP3 application for two DSPs show the effectiveness and the retargetability of our approach. Using the presented techniques, the number of execution cycles is reduced by 51\% on average for the M3-DSP and by 38\% on average for the ADSP2100 compared to standard techniques.},
} The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting special hardware features. Due to the irregular architectures present in todays DSPs there is a need of compilers which are capable of performing a phase coupling of the highly interdependent code generation subtasks and a graph based code selection. In this paper we present a code generator which performs a graph based code selection and a complete phase coupling of code selection, instruction scheduling (including compaction) and register allocation. In addition, our code generator takes into account effects of the subsequent address code generation phase. In order to solve the phase coupling problem and to handle the problem complexity, our code generator is based on a genetic algorithm. Experimental results for several benchmarks and an MP3 application for two DSPs show the effectiveness and the retargetability of our approach. Using the presented techniques, the number of execution cycles is reduced by 51% on average for the M3-DSP and by 38% on average for the ADSP2100 compared to standard techniques.
|
| Peter Marwedel, Lars Wehmeyer, Manish Verma, Stefan Steinke and Urs Helmig. Fast, predictable and low energy memory references through architecture-aware compilation. In ASPDAC, pages 4-11 January 2004 [BibTeX][PDF][Abstract]@inproceedings { marw:04:aspdac,
author = {Marwedel, Peter and Wehmeyer, Lars and Verma, Manish and Steinke, Stefan and Helmig, Urs},
title = {Fast, predictable and low energy memory references through architecture-aware compilation},
booktitle = {ASPDAC},
year = {2004},
pages = {4--11},
month = jan,
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-aspdac-spm.pdf},
confidential = {n},
abstract = {The design of future high-performance embedded systems is hampered by two problems: First, the required hardware needs more energy than is available from batteries. Second, current cache-based approaches for bridging the increasing speed gap between processors and memories cannot guarantee predictable real-time behavior. A contribution to solving both problems is made in this paper which describes a comprehensive set of algorithms that can be applied at design time in order to maximally exploit scratch pad memories (SPMs). We show that both the energy consumption as well as the computed worst case execution time (WCET) can be reduced by up to 80\% and 48\%, respectively, by establishing a strong link between the memory architecture and the compiler.},
} The design of future high-performance embedded systems is hampered by two problems: First, the required hardware needs more energy than is available from batteries. Second, current cache-based approaches for bridging the increasing speed gap between processors and memories cannot guarantee predictable real-time behavior. A contribution to solving both problems is made in this paper which describes a comprehensive set of algorithms that can be applied at design time in order to maximally exploit scratch pad memories (SPMs). We show that both the energy consumption as well as the computed worst case execution time (WCET) can be reduced by up to 80% and 48%, respectively, by establishing a strong link between the memory architecture and the compiler.
|
| Manish Verma, Lars Wehmeyer and Peter Marwedel. Dynamic Overlay of Scratchpad Memory for Energy Minimization. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS) Stockholm, Sweden, September 2004 [BibTeX][PDF][Abstract]@inproceedings { verma:04:codes,
author = {Verma, Manish and Wehmeyer, Lars and Marwedel, Peter},
title = {Dynamic Overlay of Scratchpad Memory for Energy Minimization},
booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
year = {2004},
address = {Stockholm, Sweden},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-isss.pdf},
confidential = {n},
abstract = {The memory subsystem accounts for a significant portion of the aggregate energy budget of contemporary embedded systems. Moreover, there exists a large potential for optimizing the energy consumption of the memory subsystem. Consequently, novel memories as well as novel algorithms for their efficient utilization are being designed. Scratchpads are known to perform better than caches in terms of power, performance, area and predictability. However, unlike caches they depend upon software allocation techniques for their utilization. In this paper, we present an allocation technique which analyzes the application and inserts instructions to dynamically copy both code segments and variables onto the scratchpad at runtime. We demonstrate that the problem of dynamically overlaying scratchpad is an extension of the Global Register Allocation problem. The overlay problem is solved optimally using ILP formulation techniques. Our approach improves upon the only previously known allocation technique for statically allocating both variables and code segments onto the scratchpad. Experiments report an average reduction of 34\% and 18\% in the energy consumption and the runtime of the applications, respectively. A minimal increase in code size is also reported.},
} The memory subsystem accounts for a significant portion of the aggregate energy budget of contemporary embedded systems. Moreover, there exists a large potential for optimizing the energy consumption of the memory subsystem. Consequently, novel memories as well as novel algorithms for their efficient utilization are being designed. Scratchpads are known to perform better than caches in terms of power, performance, area and predictability. However, unlike caches they depend upon software allocation techniques for their utilization. In this paper, we present an allocation technique which analyzes the application and inserts instructions to dynamically copy both code segments and variables onto the scratchpad at runtime. We demonstrate that the problem of dynamically overlaying scratchpad is an extension of the Global Register Allocation problem. The overlay problem is solved optimally using ILP formulation techniques. Our approach improves upon the only previously known allocation technique for statically allocating both variables and code segments onto the scratchpad. Experiments report an average reduction of 34% and 18% in the energy consumption and the runtime of the applications, respectively. A minimal increase in code size is also reported.
|
| Markus Lorenz, Peter Marwedel, Thorsten Dräger, Gerhard Fettweis and Rainer Leupers. Compiler based Exploration of DSP Energy Savings by SIMD Operations. In ASPDAC, pages 839-842 June 2004 [BibTeX][PDF][Abstract]@inproceedings { lorenz:04:aspdac,
author = {Lorenz, Markus and Marwedel, Peter and Dr\"ager, Thorsten and Fettweis, Gerhard and Leupers, Rainer},
title = {Compiler based Exploration of {DSP} Energy Savings by {SIMD} Operations},
booktitle = {ASPDAC},
year = {2004},
pages = {839--842},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-aspdac-lorenz.pdf},
confidential = {n},
abstract = {The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting their special architecture features. Besides the irregular DSP architectures for reducing chip size and energy consumption, single instruction multiple data (SIMD) functionality is frequently integrated with the intention of performance improvement. In order to get an energy-efficient system consisting of processor and compiler, it is necessary to optimize hardware as well as software. It is not obvious that SIMD operations can save any energy: if n operations are executed in parallel, each of them might consume the same amount of energy as if there were executed sequentially. Up to now, no work has been done to investigate the influence of compiler generated code containing SIMD operations w.r.t. the energy consumption. This paper deals with the exploration of the energy saving potential of SIMD operations for a DSP by using a generic compilation framework including an integrated instruction level energy cost model for our target architecture. Effects of SIMD operations on the energy consumption are shown for several benchmarks and an MP3 application.},
} The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting their special architecture features. Besides the irregular DSP architectures for reducing chip size and energy consumption, single instruction multiple data (SIMD) functionality is frequently integrated with the intention of performance improvement. In order to get an energy-efficient system consisting of processor and compiler, it is necessary to optimize hardware as well as software. It is not obvious that SIMD operations can save any energy: if n operations are executed in parallel, each of them might consume the same amount of energy as if there were executed sequentially. Up to now, no work has been done to investigate the influence of compiler generated code containing SIMD operations w.r.t. the energy consumption. This paper deals with the exploration of the energy saving potential of SIMD operations for a DSP by using a generic compilation framework including an integrated instruction level energy cost model for our target architecture. Effects of SIMD operations on the energy consumption are shown for several benchmarks and an MP3 application.
|
| Peter Marwedel and Birgit Sirocic. Bridges to computer architectures education. In Proceedings of the Workshop of Computer Architecture Education (WCAE) Munich, Germany, June 2004 [BibTeX][PDF][Abstract]@inproceedings { marwedel:04:wcae,
author = {Marwedel, Peter and Sirocic, Birgit},
title = {Bridges to computer architectures education},
booktitle = {Proceedings of the Workshop of Computer Architecture Education (WCAE)},
year = {2004},
address = {Munich, Germany},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-wcae.pdf},
confidential = {n},
abstract = {Bridging the gap between the student's current knowledge and the technical details of computer systems is frequently required for the education of undergraduate students or students with a lack of previous technical knowledge. In this paper we will describe how to build bridges by the combination of introducing Flash-animations and the educational units of the RaVi system. In a first step the Flash-based animations make the students familiar with the underlaying principles. After that the student could jump to the more technical context by using the educational unit of RaVi. We have developed two bridges, one for explaining the principles of cache coherency protocols and the other for showing the concept of processor pipelines.},
} Bridging the gap between the student's current knowledge and the technical details of computer systems is frequently required for the education of undergraduate students or students with a lack of previous technical knowledge. In this paper we will describe how to build bridges by the combination of introducing Flash-animations and the educational units of the RaVi system. In a first step the Flash-based animations make the students familiar with the underlaying principles. After that the student could jump to the more technical context by using the educational unit of RaVi. We have developed two bridges, one for explaining the principles of cache coherency protocols and the other for showing the concept of processor pipelines.
|
| Heiko Falk and Peter Marwedel. Control Flow driven Splitting of Loop Nests at the Source Code Level. In Design, Automation and Test in Europe (DATE) 2003, pages 410-415 Munich/Germany, March 2003 [BibTeX][PDF][Abstract]@inproceedings { falk:03:date,
author = {Falk, Heiko and Marwedel, Peter},
title = {Control Flow driven Splitting of Loop Nests at the Source Code Level},
booktitle = {Design, Automation and Test in Europe (DATE) 2003},
year = {2003},
pages = {410--415},
address = {Munich/Germany},
month = mar,
keywords = {sco},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-date.pdf},
confidential = {n},
abstract = {This paper presents a novel source code transformation for control flow optimization called loop nest splitting which minimizes the number of executed if-statements in loop nests of embedded multimedia applications. The goal of the optimization is to reduce runtimes and energy consumption. The analysis techniques are based on precise mathematical models combined with genetic algorithms. Due to the inherent portability of source code transformations, a very detailed benchmarking using 10 different processors can be performed. The application of our implemented algorithms to three real-life multimedia benchmarks leads to average speed-ups by 23.6\% - 62.1\% and energy savings by 19.6\% - 57.7\%. Furthermore, our optimization also leads to advantageous pipeline and cache performance.},
} This paper presents a novel source code transformation for control flow optimization called loop nest splitting which minimizes the number of executed if-statements in loop nests of embedded multimedia applications. The goal of the optimization is to reduce runtimes and energy consumption. The analysis techniques are based on precise mathematical models combined with genetic algorithms. Due to the inherent portability of source code transformations, a very detailed benchmarking using 10 different processors can be performed. The application of our implemented algorithms to three real-life multimedia benchmarks leads to average speed-ups by 23.6% - 62.1% and energy savings by 19.6% - 57.7%. Furthermore, our optimization also leads to advantageous pipeline and cache performance.
|
| Manish Verma, Lars Wehmeyer and Peter Marwedel. Efficient Scratchpad Allocation Algorithms for Energy Constrained Embedded Systems. In PACS 2003 San Diego, CA, June 2003, Also in: Lecture Notes in Computer Science (LNCS 3164) Vol. 3164/2004 [BibTeX][PDF][Abstract]@inproceedings { verma:03:pacs,
author = {Verma, Manish and Wehmeyer, Lars and Marwedel, Peter},
title = {Efficient Scratchpad Allocation Algorithms for Energy Constrained Embedded Systems},
booktitle = {PACS 2003},
year = {2003},
address = {San Diego, CA},
month = jun,
note = {Also in: Lecture Notes in Computer Science (LNCS 3164) Vol. 3164/2004},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-pacs.pdf},
confidential = {n},
abstract = {In the context of portable embedded systems, reducing energy is one of the prime objectives. Memories are responsible for a significant percentage of a system's aggregate energy consumption. Consequently, novel memories as well as novel memory hierarchies are being designed to reduce the energy consumption. Caches and scratchpads are two contrasting variants of memory architectures. The former relies completely on hardware logic while the latter requires software for its utilization. Most high-end embedded microprocessors today include onchip instruction and data caches along with a scratchpad. Previous software approaches for utilizing scratchpad did not consider caches and hence fail for the prevalent high-end system architectures. In this work, we use the scratchpad for storing instructions. We solve the allocation problem using a greedy heuristic and also solve it optimally using an ILP formulation. We report an average reduction of 20.7\% in instruction memory energy consumption compared to a previously published technique. Larger reductions are also reported when the problem is solved optimally. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against that of preloaded loop caches, we report average energy savings of 28.9\% using the heuristic.},
} In the context of portable embedded systems, reducing energy is one of the prime objectives. Memories are responsible for a significant percentage of a system's aggregate energy consumption. Consequently, novel memories as well as novel memory hierarchies are being designed to reduce the energy consumption. Caches and scratchpads are two contrasting variants of memory architectures. The former relies completely on hardware logic while the latter requires software for its utilization. Most high-end embedded microprocessors today include onchip instruction and data caches along with a scratchpad. Previous software approaches for utilizing scratchpad did not consider caches and hence fail for the prevalent high-end system architectures. In this work, we use the scratchpad for storing instructions. We solve the allocation problem using a greedy heuristic and also solve it optimally using an ILP formulation. We report an average reduction of 20.7% in instruction memory energy consumption compared to a previously published technique. Larger reductions are also reported when the problem is solved optimally. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against that of preloaded loop caches, we report average energy savings of 28.9% using the heuristic.
|
| Peter Marwedel and Birgit Sirocic. Overcoming the Limitations of Traditional Media For Teaching Modern Processor Design. In International Conference on Microelectronic Systems Education (MSE 2003) Anaheim, June 2003 [BibTeX][PDF][Abstract]@inproceedings { marwedel:03:mse,
author = {Marwedel, Peter and Sirocic, Birgit},
title = {Overcoming the Limitations of Traditional Media For Teaching Modern Processor Design},
booktitle = {International Conference on Microelectronic Systems Education (MSE 2003)},
year = {2003},
address = {Anaheim},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-mse.pdf},
confidential = {n},
abstract = {Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books can be used for describing the dynamic behavior of processors. Visualization of this behavior, however, is impossible, due to the static nature of books. In this paper, we describe a Java-based tool for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol. These models were found to be more useful than general simulators in classroom use. The Java-based design also enables Internet-based distance learning. Tools are available at http://ls12-www.cs.tu-dortmund.de/ravi.},
} Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books can be used for describing the dynamic behavior of processors. Visualization of this behavior, however, is impossible, due to the static nature of books. In this paper, we describe a Java-based tool for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol. These models were found to be more useful than general simulators in classroom use. The Java-based design also enables Internet-based distance learning. Tools are available at http://ls12-www.cs.tu-dortmund.de/ravi.
|
| Manish Verma, Stefan Steinke and Peter Marwedel. Data Partitioning for Maximal Scratchpad Usage. In ASPDAC 2003 KitaKyushu/Japan, January 2003 [BibTeX][PDF][Abstract]@inproceedings { verma:03:aspdac,
author = {Verma, Manish and Steinke, Stefan and Marwedel, Peter},
title = {Data Partitioning for Maximal Scratchpad Usage},
booktitle = {ASPDAC 2003},
year = {2003},
address = {KitaKyushu/Japan},
month = jan,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-aspdac.pdf},
confidential = {n},
abstract = {The energy consumption for Mobile Embedded Systems is a limiting factor because of today's battery capacities. The memory subsystem consumes a large chunk of the energy, necessitating its efficient utilization. Energy efficient scratchpads are thus becoming common, though unlike caches they require to be explicitly utilized. In this paper, an algorithm integrated into a compiler is presented which analyzes the application, partitions an array variable whenever its beneficial, appropriately modifies the application and selects the best set of variables and program parts to be placed onto the scratchpad. Results show an energy improvement between 5.7\% and 17.6\% for a variety of applications against a previously known algorithm.},
} The energy consumption for Mobile Embedded Systems is a limiting factor because of today's battery capacities. The memory subsystem consumes a large chunk of the energy, necessitating its efficient utilization. Energy efficient scratchpads are thus becoming common, though unlike caches they require to be explicitly utilized. In this paper, an algorithm integrated into a compiler is presented which analyzes the application, partitions an array variable whenever its beneficial, appropriately modifies the application and selects the best set of variables and program parts to be placed onto the scratchpad. Results show an energy improvement between 5.7% and 17.6% for a variety of applications against a previously known algorithm.
|
| Rainer Leupers, Oliver Wahlen, Manuel Hohenauer, Tim Kogel and Peter Marwedel. An Executable Intermediate Representation for Retargetable Compilation and High-Level Code Optimization. In International Conference on Information Communication Technologies in Education (SAMOS 2003) June 2003 [BibTeX][PDF][Abstract]@inproceedings { leupers:03:samos,
author = {Leupers, Rainer and Wahlen, Oliver and Hohenauer, Manuel and Kogel, Tim and Marwedel, Peter},
title = {An Executable Intermediate Representation for Retargetable Compilation and High-Level Code Optimization},
booktitle = {International Conference on Information Communication Technologies in Education (SAMOS 2003)},
year = {2003},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-samosIII.pdf},
confidential = {n},
abstract = {Due to fast time-to-market and IP reuse requirements, an increasing amount of the functionality of embedded HW/SW systems is implemented in software. As a consequence, software programming languages like C play an important role in system specification, design, and validation. Besides many other advantages, the C language offers executable specifications, with clear semantics and high simulation speed. However, virtually any tool operating on C specifications has to convert C sources into some intermediate representation (IR), during which the executability is normally lost. In order to overcome this problem, this paper describes a novel IR format, called IR-C, for the use in C based design tools, which combines the simplicity of three address code with the executability of C. Besides the IR-C format and its generation from ANSI C, we also describe its applications in the areas of validation, retargetable compilation, and sourcelevel code optimization.},
} Due to fast time-to-market and IP reuse requirements, an increasing amount of the functionality of embedded HW/SW systems is implemented in software. As a consequence, software programming languages like C play an important role in system specification, design, and validation. Besides many other advantages, the C language offers executable specifications, with clear semantics and high simulation speed. However, virtually any tool operating on C specifications has to convert C sources into some intermediate representation (IR), during which the executability is normally lost. In order to overcome this problem, this paper describes a novel IR format, called IR-C, for the use in C based design tools, which combines the simplicity of three address code with the executability of C. Besides the IR-C format and its generation from ANSI C, we also describe its applications in the areas of validation, retargetable compilation, and sourcelevel code optimization.
|
| Peter Marwedel and Birgit Sirocic. Multimedia components for the visualization of dynamic behavior in computer architectures. In Proceedings of the Workshop of Computer Architecture Education (WCAE'03) San Diego, CA, June 2003 [BibTeX][PDF][Abstract]@inproceedings { marwedel:03:wcae,
author = {Marwedel, Peter and Sirocic, Birgit},
title = {Multimedia components for the visualization of dynamic behavior in computer architectures},
booktitle = {Proceedings of the Workshop of Computer Architecture Education (WCAE'03)},
year = {2003},
address = {San Diego, CA},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-wcae.pdf},
confidential = {n},
abstract = {Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books use text for describing the dynamic behavior of processors. Visualization of this behavior, however, is impossible, due to the static nature of books. In this paper, we describe multimedia components for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol.},
} Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books use text for describing the dynamic behavior of processors. Visualization of this behavior, however, is impossible, due to the static nature of books. In this paper, we describe multimedia components for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol.
|
| Stefan Steinke, Lars Wehmeyer, Bo-Sik Lee and Peter Marwedel. Assigning Program and Data Objects to Scratchpad for Energy Reduction. In DATE 2002 Paris/France, March 2002 [BibTeX][PDF][Abstract]@inproceedings { steinke:02:date,
author = {Steinke, Stefan and Wehmeyer, Lars and Lee, Bo-Sik and Marwedel, Peter},
title = {Assigning Program and Data Objects to Scratchpad for Energy Reduction},
booktitle = {DATE 2002},
year = {2002},
address = {Paris/France},
month = mar,
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-date.pdf},
confidential = {n},
abstract = {The number of embedded systems is increasing and a remarkable percentage is designed as mobile applications. For the latter, the energy consumption is a limiting factor because of today's battery capacities. Besides the processor, memory accesses consume a high amount of energy. The use of additional less power hungry memories like caches or scratchpads is thus common. Caches incorporate the hardware control logic for moving data in and out automatically. On the other hand, this logic requires chip area and energy. A scratchpad memory is much more energy efficient, but there is a need for software control of its content. In this paper, an algorithm integrated into a compiler is presented which analyses the application and selects program and data parts which are placed into the scratchpad. Comparisons against a cache solution show remarkable advantages between 12\% and 43\% in energy consumption for designs of the same memory size.},
} The number of embedded systems is increasing and a remarkable percentage is designed as mobile applications. For the latter, the energy consumption is a limiting factor because of today's battery capacities. Besides the processor, memory accesses consume a high amount of energy. The use of additional less power hungry memories like caches or scratchpads is thus common. Caches incorporate the hardware control logic for moving data in and out automatically. On the other hand, this logic requires chip area and energy. A scratchpad memory is much more energy efficient, but there is a need for software control of its content. In this paper, an algorithm integrated into a compiler is presented which analyses the application and selects program and data parts which are placed into the scratchpad. Comparisons against a cache solution show remarkable advantages between 12% and 43% in energy consumption for designs of the same memory size.
|
| Stefan Steinke, Nils Grunwald, Lars Wehmeyer, Rajeshwari Banakar, M. Balakrishnan and Peter Marwedel. Reducing Energy Consumption by Dynamic Copying of Instructions onto Onchip Memory. In ISSS 2002 Kyoto/Japan, October 2002 [BibTeX][PDF][Abstract]@inproceedings { steinke:02:isss,
author = {Steinke, Stefan and Grunwald, Nils and Wehmeyer, Lars and Banakar, Rajeshwari and Balakrishnan, M. and Marwedel, Peter},
title = {Reducing Energy Consumption by Dynamic Copying of Instructions onto Onchip Memory},
booktitle = {ISSS 2002},
year = {2002},
address = {Kyoto/Japan},
month = oct,
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-isss.pdf},
confidential = {n},
abstract = {The number of mobile embedded systems is increasing and all of them are limited in their uptime by their battery capacity. Several hardware changes have been introduced during the last years, but the steadily growing functionality still requires further energy reductions, e.g. by software optimizations. A significant amount of energy can be saved in the memory hierarchy where most of the energy is consumed. In this paper a new software technique is presented which supports the use of an onchip scratchpad memory by dynamically copying program parts into it. The set of selected program parts are determined with an optimal algorithm using integer linear programming. Experimental results show a reduction of the energy consumption by nearly 30\%, a performance increase by 25\% against a common cache system and energy improvements against a static approach of up to 38\%.},
} The number of mobile embedded systems is increasing and all of them are limited in their uptime by their battery capacity. Several hardware changes have been introduced during the last years, but the steadily growing functionality still requires further energy reductions, e.g. by software optimizations. A significant amount of energy can be saved in the memory hierarchy where most of the energy is consumed. In this paper a new software technique is presented which supports the use of an onchip scratchpad memory by dynamically copying program parts into it. The set of selected program parts are determined with an optimal algorithm using integer linear programming. Experimental results show a reduction of the energy consumption by nearly 30%, a performance increase by 25% against a common cache system and energy improvements against a static approach of up to 38%.
|
| Rajeshwari Banakar, Stefan Steinke, Bo-Sik Lee, M. Balakrishnan and Peter Marwedel. Scratchpad Memory : A Design Alternative for Cache On-chip memory in Embedded Systems. In CODES Estes Park (Colorado), May 2002 [BibTeX][PDF][Abstract]@inproceedings { banakar:02:codes,
author = {Banakar, Rajeshwari and Steinke, Stefan and Lee, Bo-Sik and Balakrishnan, M. and Marwedel, Peter},
title = {Scratchpad Memory: A Design Alternative for Cache On-chip Memory in Embedded Systems},
booktitle = {CODES},
year = {2002},
address = {Estes Park (Colorado)},
month = may,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-codes.pdf},
confidential = {n},
abstract = {In this paper we address the problem of on-chip memory selection for computationally intensive applications, by proposing scratch pad memory as an alternative to cache. Area and energy for different scratch pad and cache sizes are computed using the CACTI tool while performance was evaluated using the trace results of the simulator. The target processor chosen for evaluation was AT91M40400. The results clearly establish scratchpad memory as a low power alternative in most situations with an average energy reduction of 40\%. Further the average area-time reduction for the scratchpad memory was 46\% of the cache memory.},
} In this paper we address the problem of on-chip memory selection for computationally intensive applications, by proposing scratch pad memory as an alternative to cache. Area and energy for different scratch pad and cache sizes are computed using the CACTI tool while performance was evaluated using the trace results of the simulator. The target processor chosen for evaluation was AT91M40400. The results clearly establish scratchpad memory as a low power alternative in most situations with an average energy reduction of 40%. Further the average area-time reduction for the scratchpad memory was 46% of the cache memory.
|
| Peter Marwedel and Luca Benini. Low-power/low-energy embedded software. In Tutorial at Design, Automation and Test in Europe (DATE) Paris, 2002, March 2002 [BibTeX][Abstract]@inproceedings { marwedel:02:date,
author = {Marwedel, Peter and Benini, Luca},
title = {Low-power/low-energy embedded software},
booktitle = {Tutorial at Design, Automation and Test in Europe (DATE)},
year = {2002},
address = {Paris, France},
month = mar,
confidential = {n},
abstract = {The size of the embedded system market is expected to grow significantly in the next years. For many of these systems, power and energy management are becoming primary issues due to rapidly increasing computational requirement and the slow rate of improvements in battery technology. Hence, researchers and designers have to be aware of approaches for reducing the power and energy consumption of embedded systems. In addition to standard techniques used at the device and the circuit level, new techniques are required which exploit the fact that a major part of the functionality of embedded systems is implemented in software. There are various levels at which the energy consumption of software can be considered, including the high-level algorithm descriptions, compilers, and operating systems. These levels will be covered in the tutorial. Summary: With a focus on programmable embedded systems, this tutorial will: \begin{itemize} \item survey the interaction of architecture, operating systems, compilers and memories from a power/energy focus, \item present specific contributors of each part to power and energy, and \item outline software techniques for minimization of power/energy. \end{itemize} Outline: In the first section, an introduction to the topic will be provided. Due to the large influence of the memory architecture on the total energy consumption, different memory architectures will be presented next. We will show how partitioned memories can help reducing the energy consumption. In the next section, we will describe how partitioned memory architectures and other features of embedded systems can be exploited in compilers. This includes the exploitation of scratch-pad memories and a comparison between scratch-pads and caches. In addition, this includes an analysis of the size of register files. Furthermore, we will explain techniques for reducing the memory traffic by global optimizations designed for multimedia applications. 
This section also comprises a description of applicable standard compiler optimizations and their potential for energy reductions as well as a brief introduction to compression techniques. The final section describes system software and real-time operating system (RTOS) issues. This will include hardware for RTOS-based power management, software support for power management, power-aware process scheduling and power-aware device management. Exploitation of application-specific information and power management of distributed systems will also be covered. Audience: Researchers starting to work on embedded systems or on low power design techniques; designers interested in getting to know available methodologies for low power software design.},
} The size of the embedded system market is expected to grow significantly in the next years. For many of these systems, power and energy management are becoming primary issues due to rapidly increasing computational requirement and the slow rate of improvements in battery technology. Hence, researchers and designers have to be aware of approaches for reducing the power and energy consumption of embedded systems. In addition to standard techniques used at the device and the circuit level, new techniques are required which exploit the fact that a major part of the functionality of embedded systems is implemented in software. There are various levels at which the energy consumption of software can be considered, including the high-level algorithm descriptions, compilers, and operating systems. These levels will be covered in the tutorial. Summary: With a focus on programmable embedded systems, this tutorial will: itemize \item survey the interaction of architecture, operating systems, compilers and memories from a power/energy focus, \item present specific contributors of each part to power and energy, and \item outline software techniques for minimization of power/energy. itemize Outline: In the first section, an introduction to the topic will be provided. Due to the large influence of the memory architecture on the total energy consumption, different memory architectures will be presented next. We will show how partitioned memories can help reducing the energy consumption. In the next section, we will describe how partitioned memory architectures and other features of embedded systems can be exploited in compilers. This includes the exploitation of scratch-pad memories and a comparison between scratch-pads and caches. In addition, this includes an analysis of the size of register files. Furthermore, we will explain techniques for reducing the memory traffic by global optimizations designed for multimedia applications. 
This section also comprises a description of applicable standard compiler optimizations and their potential for energy reductions as well as a brief introduction to compression techniques. The final section describes system software and real-time operating system (RTOS) issues. This will include hardware for RTOS-based power management, software support for power management, power-aware process scheduling and power-aware device management. Exploitation of application-specific information and power management of distributed systems will also be covered. Audience: Researchers starting to work on embedded systems or on low power design techniques; designers interested in getting to know available methodologies for low power software design.
|
| Peter Marwedel and Srinivas Devadas (eds.). LCTES'02-SCOPES'02: Joint Conference on Languages, Compilers and Tools for Embedded Systems & Software and Compilers for Embedded Systems. In ACM Press June 2002 [BibTeX][Abstract]@inproceedings { marwedel:02:lctes,
author = {Marwedel, Peter and Devadas, Srinivas},
editor = {Marwedel, Peter and Devadas, Srinivas},
title = {{LCTES}'02-{SCOPES}'02: Joint Conference on Languages, Compilers and Tools for Embedded Systems \& Software and Compilers for Embedded Systems},
booktitle = {ACM Press},
year = {2002},
month = jun,
confidential = {n},
abstract = {(Draft version of the preface, final layout is different): This volume contains the proceedings of the Joint Languages, Compilers, and Tools for Embedded Systems (LCTES'02) and Software and Compilers for Embedded Systems (SCOPES'02) Conference. LCTES/SCOPES'02 took place in Berlin during June 19th to the 21st. For the first time, LCTES and SCOPES were held together, resulting in stimulating contacts between researchers predominantly having a background in programming languages and electronic design automation, respectively. Also, for the very first time, LCTES was held as a conference and not as a workshop. LCTES/SCOPES'02 received a total of 73 papers. During a comprehensive review process a total of 234 reviews were submitted. Finally, 25 papers were accepted and included in the resulting high-quality program. Accepted papers covered the following areas: compilers including low-energy compilation, synthesis, design space exploration, debugging and validation, code generation and register allocation, processor modeling, hardware/software codesign, and real-time scheduling. Accepted papers were grouped into 10 sessions. In addition, two invited keynotes given by Dr. Philippe Magarshack of STMicroelectronics and Prof. Gerhard Fettweis of Systemonic AG and of Dresden University emphasized industrial view points and perspectives. We thank the following members of the program committee for their efforts in reviewing the papers: David August, Shuvra S. Bhattacharyya, Raul Camposano, Keith D. Cooper, Rajiv Gupta, Mary Hall, Seongsoo Hong, Masaharu Imai, Ahmed Jerraya, Jochen Jess, Kurt Keutzer, Rainer Leupers, Annie Liu, Jef van Meerbergen, SangLyul Min, Frank Mueller, Tatsuo Nakajima, Alex Nicolau, Santosh Pande, Manas Saksena, Bob Rau, Wolfgang Rosenstiel, Sreeranga P. Rajan, Gang-Ryung Uh, Carl Von Platen, Bernard Wess, David Whalley, Reinhard Wilhelm, and Hiroto Yasuura. 
We thank Jens Knoop of the University of Dortmund who served as the Finance Chair, Ahmed Jerraya of TIMA, Grenoble who served as the Publicity Chair for their efforts in organizing LCTES/SCOPES'02 and making it a success. Thanks to Armin Zimmermann of the Technical University of Berlin for organizing the social event in LCTES/SCOPES'02. We acknowledge the sponsorship of ACM SIGPLAN. ACM also provided travel grants to students. The European Design Automation Association (EDAA) provided an in-cooperation status to LCTES/SCOPES. Finally, we thank the steering committee for their efforts in ensuring continuity from the previous LCTES workshops. Peter Marwedel, University of Dortmund \\ Srinivas Devadas, MIT \\ Co-Chairs of LCTES/SCOPES'02},
} (Draft version of the preface, final layout is different): This volume contains the proceedings of the Joint Languages, Compilers, and Tools for Embedded Systems (LCTES'02) and Software and Compilers for Embedded Systems (SCOPES'02) Conference. LCTES/SCOPES'02 took place in Berlin during June 19th to the 21st. For the first time, LCTES and SCOPES were held together, resulting in stimulating contacts between researchers predominantly having a background in programming languages and electronic design automation, respectively. Also, for the very first time, LCTES was held as a conference and not as a workshop. LCTES/SCOPES'02 received a total of 73 papers. During a comprehensive review process a total of 234 reviews were submitted. Finally, 25 papers were accepted and included in the resulting high-quality program. Accepted papers covered the following areas: compilers including low-energy compilation, synthesis, design space exploration, debugging and validation, code generation and register allocation, processor modeling, hardware/software codesign, and real-time scheduling. Accepted papers were grouped into 10 sessions. In addition, two invited keynotes given by Dr. Philippe Magarshack of STMicroelectronics and Prof. Gerhard Fettweis of Systemonic AG and of Dresden University emphasized industrial view points and perspectives. We thank the following members of the program committee for their efforts in reviewing the papers: David August, Shuvra S. Bhattacharyya, Raul Camposano, Keith D. Cooper, Rajiv Gupta, Mary Hall, Seongsoo Hong, Masaharu Imai, Ahmed Jerraya, Jochen Jess, Kurt Keutzer, Rainer Leupers, Annie Liu, Jef van Meerbergen, SangLyul Min, Frank Mueller, Tatsuo Nakajima, Alex Nicolau, Santosh Pande, Manas Saksena, Bob Rau, Wolfgang Rosenstiel, Sreeranga P. Rajan, Gang-Ryung Uh, Carl Von Platen, Bernard Wess, David Whalley, Reinhard Wilhelm, and Hiroto Yasuura. 
We thank Jens Knoop of the University of Dortmund who served as the Finance Chair, Ahmed Jerraya of TIMA, Grenoble who served as the Publicity Chair for their efforts in organizing LCTES/SCOPES'02 and making it a success. Thanks to Armin Zimmermann of the Technical University of Berlin for organizing the social event in LCTES/SCOPES'02. We acknowledge the sponsorship of ACM SIGPLAN. ACM also provided travel grants to students. The European Design Automation Association (EDAA) provided an in-cooperation status to LCTES/SCOPES. Finally, we thank the steering committee for their efforts in ensuring continuity from the previous LCTES workshops. Peter Marwedel, University of Dortmund \ Srinivas Devadas, MIT \ Co-Chairs of LCTES/SCOPES'02
|
| Peter Marwedel, Khac Dung Cong and Sergej Schwenk. RAVI: Interactive Visualization of information systems dynamics using a Java-based schematic editor and simulator. In Open IFIP-GI-Conference on Social, Ethical and Cognitive Issues of Informatics and ICT (Information and Communication Technologies), SECIII June 2002 [BibTeX][PDF][Abstract]@inproceedings { marwedel:02:seciii,
author = {Marwedel, Peter and Cong, Khac Dung and Schwenk, Sergej},
title = {{RAVI}: Interactive Visualization of information systems dynamics using a {Java}-based schematic editor and simulator},
booktitle = {Open IFIP-GI-Conference on Social, Ethical and Cognitive Issues of Informatics and ICT (Information and Communication Technologies), SECIII},
year = {2002},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-seciii.pdf},
confidential = {n},
abstract = {One of the key limitations of traditional media, e.g. books, for teaching is their lack of visualizing the dynamic behavior of systems. Videos tapes and video distribution techniques have made it possible to show non-interactive media elements to students. However, one of the key advantages of multimedia systems over traditional media is their potential of providing interactiveness. This interactiveness, however, is not easy to obtain, since it requires the simulation of the system to be visualized. Designing simulators can be a challenging task that can not be solved within the time-frame usually allocated for multi-media projects. On the other hand, available simulators are not suitable for classroom use. They are frequently designed for optimum simulation speed and complex design projects. Ease of use, excellent visualization and portability have normally not been top goals for simulator design. Also, powerful simulators are typically proprietary and come at high costs, preventing their widespread deployment to class rooms and into the hands of students.},
} One of the key limitations of traditional media, e.g. books, for teaching is their lack of visualizing the dynamic behavior of systems. Videos tapes and video distribution techniques have made it possible to show non-interactive media elements to students. However, one of the key advantages of multimedia systems over traditional media is their potential of providing interactiveness. This interactiveness, however, is not easy to obtain, since it requires the simulation of the system to be visualized. Designing simulators can be a challenging task that can not be solved within the time-frame usually allocated for multi-media projects. On the other hand, available simulators are not suitable for classroom use. They are frequently designed for optimum simulation speed and complex design projects. Ease of use, excellent visualization and portability have normally not been top goals for simulator design. Also, powerful simulators are typically proprietary and come at high costs, preventing their widespread deployment to class rooms and into the hands of students.
|
| Peter Marwedel. Embedded Software: How to make it efficient?. In Proceedings of the Euromicro Symposium on Digital System Design Dortmund, 2002 [BibTeX][PDF][Abstract]@inproceedings { marwedel:02:euromicro,
author = {Marwedel, Peter},
title = {Embedded Software: How to make it efficient?},
booktitle = {Proceedings of the Euromicro Symposium on Digital System Design},
year = {2002},
address = {Dortmund},
publisher = {IEEE Design \& Test},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-euromicro.pdf},
confidential = {n},
abstract = {This paper stresses the importance of designing efficient embedded software and it provides a global view of some of the techniques that have been developed to meet this goal. These techniques include high-level transformations, compiler optimizations reducing the energy consumption of embedded programs and optimizations exploiting architectural features of embedded processors. Such optimizations lead to significant reductions of the execution time, the required energy and the memory size of embedded applications. Despite this, they can hardly be found in any available compiler. particular application) and processors and software. The latter is used to meet the flexibility requirements found for almost all of today's applications.},
} This paper stresses the importance of designing efficient embedded software and it provides a global view of some of the techniques that have been developed to meet this goal. These techniques include high-level transformations, compiler optimizations reducing the energy consumption of embedded programs and optimizations exploiting architectural features of embedded processors. Such optimizations lead to significant reductions of the execution time, the required energy and the memory size of embedded applications. Despite this, they can hardly be found in any available compiler. particular application) and processors and software. The latter is used to meet the flexibility requirements found for almost all of today's applications.
|
| Manoj Kumar Jain, Lars Wehmeyer, Stefan Steinke, Peter Marwedel and M. Balakrishnan. Evaluating Register File Size in ASIP Design. In CODES Copenhagen (Denmark), April 2001 [BibTeX][PDF][Abstract]@inproceedings { jain:01:codes,
author = {Jain, Manoj Kumar and Wehmeyer, Lars and Steinke, Stefan and Marwedel, Peter and Balakrishnan, M.},
title = {Evaluating Register File Size in {ASIP} Design},
booktitle = {CODES},
year = {2001},
address = {Copenhagen (Denmark)},
month = apr,
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-codes.pdf},
confidential = {n},
abstract = {Interest in synthesis of Application Specific Instruction Set Processors or ASIPs has increased considerably and a number of methodologies have been proposed for ASIP design. A key step in ASIP synthesis involves deciding architectural features based on application requirements and constraints. In this paper we observe the effect of changing register file size on the performance as well as power and energy consumption. Detailed data is generated and analyzed for a number of application programs. Results indicate that choice of an appropriate number of registers has a significant impact on performance.},
} Interest in synthesis of Application Specific Instruction Set Processors or ASIPs has increased considerably and a number of methodologies have been proposed for ASIP design. A key step in ASIP synthesis involves deciding architectural features based on application requirements and constraints. In this paper we observe the effect of changing register file size on the performance as well as power and energy consumption. Detailed data is generated and analyzed for a number of application programs. Results indicate that choice of an appropriate number of registers has a significant impact on performance.
|
| Markus Lorenz, Rainer Leupers, Peter Marwedel, Thorsten Dräger and Gerhard P. Fettweis. Low-Energy DSP Code Generation Using a Genetic Algorithm. In ICCD '01 Austin/Texas/USA, September 2001 [BibTeX][PDF][Abstract]@inproceedings { lorenz:01:iccd,
author = {Lorenz, Markus and Leupers, Rainer and Marwedel, Peter and Dr\"ager, Thorsten and Fettweis, Gerhard P.},
title = {Low-Energy {DSP} Code Generation Using a Genetic Algorithm},
booktitle = {ICCD '01},
year = {2001},
address = {Austin/Texas/USA},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-iccd.pdf},
confidential = {n},
abstract = {This paper deals with low-energy code generation for a highly optimized digital signal processor designed for mobile communication applications. We present a genetic algorithm based code generator (GCG), and an instruction-level power model for this processor. Our code generator is capable of reducing the power dissipation of target applications by means of two techniques: First, GCG minimizes the number of memory accesses by using a special list-scheduling algorithm. This technique makes it possible to perform graph based code selection and to take into account the high interdependencies of the subtasks of code generation by phase coupling. In addition, GCG optimizes the scheduling of processor instructions with respect to the instruction-level power model based on a gate level simulation. Experimental results for several benchmarks show the effectiveness of our approach.},
} This paper deals with low-energy code generation for a highly optimized digital signal processor designed for mobile communication applications. We present a genetic algorithm based code generator (GCG), and an instruction-level power model for this processor. Our code generator is capable of reducing the power dissipation of target applications by means of two techniques: First, GCG minimizes the number of memory accesses by using a special list-scheduling algorithm. This technique makes it possible to perform graph based code selection and to take into account the high interdependencies of the subtasks of code generation by phase coupling. In addition, GCG optimizes the scheduling of processor instructions with respect to the instruction-level power model based on a gate level simulation. Experimental results for several benchmarks show the effectiveness of our approach.
|
| Markus Lorenz, David Kottmann, Steven Bashford, Rainer Leupers and Peter Marwedel. Optimized Address Assignment for DSPs with SIMD Memory Accesses. In ASP-DAC '01 Yokohama/Japan, January 2001 [BibTeX][PDF][Abstract]@inproceedings { lorenz:01:aspdac,
author = {Lorenz, Markus and Kottmann, David and Bashford, Steven and Leupers, Rainer and Marwedel, Peter},
title = {Optimized Address Assignment for {DSPs} with {SIMD} Memory Accesses},
booktitle = {ASP-DAC '01},
year = {2001},
address = {Yokohama/Japan},
month = jan,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-aspdac.pdf},
confidential = {n},
abstract = {This paper deals with address assignment in code generation for digital signal processors (DSPs) with SIMD (single instruction multiple data) memory accesses. In these processors data are organized in groups (or partitions), whose elements share one common memory address. In order to optimize program performance for processors with such memory architectures it is important to have a suitable memory layout of the variables. We propose a two-step address assignment technique for scalar variables using a genetic algorithm based partitioning method and a graph based heuristic which makes use of available DSP address generation hardware. We show that our address assignment techniques lead to a significant code quality improvement compared to heuristics.},
} This paper deals with address assignment in code generation for digital signal processors (DSPs) with SIMD (single instruction multiple data) memory accesses. In these processors data are organized in groups (or partitions), whose elements share one common memory address. In order to optimize program performance for processors with such memory architectures it is important to have a suitable memory layout of the variables. We propose a two-step address assignment technique for scalar variables using a genetic algorithm based partitioning method and a graph based heuristic which makes use of available DSP address generation hardware. We show that our address assignment techniques lead to a significant code quality improvement compared to heuristics.
|
| Peter Marwedel, Stefan Steinke and Lars Wehmeyer. Compilation techniques for energy-, code-size-, and run-time-efficient embedded software. In Int. Workshop on Advanced Compiler Techniques for High Performance and Embedded Processors Bucharest, Hungary, 2001 [BibTeX][Abstract]@inproceedings { marwedel:01:iwact,
author = {Marwedel, Peter and Steinke, Stefan and Wehmeyer, Lars},
title = {Compilation techniques for energy-, code-size-, and run-time-efficient embedded software},
booktitle = {Int. Workshop on Advanced Compiler Techniques for High Performance and Embedded Processors},
year = {2001},
address = {Bucharest, Hungary},
keywords = {ecc},
confidential = {n},
abstract = {This paper is motivated by two essential characteristics of embedded systems: the increasing amount of software that is used for implementing embedded systems and the need for implementing embedded systems efficiently. As a consequence, embedded software has to be efficient. In the following, we will present techniques for generating efficient machine code for architectures which are typically found in embedded systems. We will demonstrate, using examples, how compilers for embedded processors can exploit features that are found in embedded processors.},
} This paper is motivated by two essential characteristics of embedded systems: the increasing amount of software that is used for implementing embedded systems and the need for implementing embedded systems efficiently. As a consequence, embedded software has to be efficient. In the following, we will present techniques for generating efficient machine code for architectures which are typically found in embedded systems. We will demonstrate, using examples, how compilers for embedded processors can exploit features that are found in embedded processors.
|
| Stefan Steinke, Markus Knauer, Lars Wehmeyer and Peter Marwedel. An Accurate and Fine Grain Instruction-Level Energy Model Supporting Software Optimizations. In PATMOS Yverdon (Switzerland), September 2001 [BibTeX][PDF][Abstract]@inproceedings { steinke:01:patmos,
author = {Steinke, Stefan and Knauer, Markus and Wehmeyer, Lars and Marwedel, Peter},
title = {An Accurate and Fine Grain Instruction-Level Energy Model Supporting Software Optimizations},
booktitle = {PATMOS},
year = {2001},
address = {Yverdon (Switzerland)},
month = sep,
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-patmos.pdf},
confidential = {n},
abstract = {Power aware compilers have been under research during the last few years. However, there is still a need for accurate energy models for supporting software optimizations. In this paper we present a new energy model on the instruction level. As an addition to former models, the bit toggling on internal and external buses as well as accesses to off-chip memories are considered. To determine the characteristics, a measuring method is presented which can be used to establish the energy model without detailed knowledge of the internal processor structures. Finally, the proposed energy model is established for the ARM7TDMI RISC processor.},
} Power aware compilers have been under research during the last few years. However, there is still a need for accurate energy models for supporting software optimizations. In this paper we present a new energy model on the instruction level. As an addition to former models, the bit toggling on internal and external buses as well as accesses to off-chip memories are considered. To determine the characteristics, a measuring method is presented which can be used to establish the energy model without detailed knowledge of the internal processor structures. Finally, the proposed energy model is established for the ARM7TDMI RISC processor.
|
| Peter Marwedel. Compilation Techniques for Embedded Software. In Invited Tutorial, Asia and South Pacific Design Automation Conference (ASP-DAC) Yokohama, Japan, 2001 [BibTeX][Abstract]@inproceedings { marwedel:01:aspdac,
author = {Marwedel, Peter},
title = {Compilation Techniques for Embedded Software},
booktitle = {Invited Tutorial, Asia and South Pacific Design Automation Conference (ASP-DAC)},
year = {2001},
address = {Yokohama, Japan},
confidential = {n},
abstract = {Embedded systems demand for efficient processor architectures, optimized for application domains or applications. Current compiler technology supports these architectures poorly and has been recognized as a bottleneck for designing systems. Recent research projects aim at removing this bottleneck. We will present code optimization approaches taking the characteristics of embedded processor architectures into account. The following topics will be included: \begin{itemize} \item memory allocation techniques, \item compiler techniques for VLIW machines, \item compiler techniques aiming at energy minimization of embedded software, \item retargetability of compilers, \item code compression techniques. \end{itemize}},
} Embedded systems demand for efficient processor architectures, optimized for application domains or applications. Current compiler technology supports these architectures poorly and has been recognized as a bottleneck for designing systems. Recent research projects aim at removing this bottleneck. We will present code optimization approaches taking the characteristics of embedded processor architectures into account. The following topics will be included: itemize \item memory allocation techniques, \item compiler techniques for VLIW machines, \item compiler techniques aiming at energy minimization of embedded software, \item retargetability of compilers, \item code compression techniques. itemize
|
| Matthias Weiss, Gerhard Fettweis, Markus Lorenz, Rainer Leupers and Peter Marwedel. Toolumgebung fuer plattformbasierte DSPs der naechsten Generation. In DSP Deutschland Munich/Germany, September 1999 [BibTeX][PDF][Abstract]@inproceedings { weiss:1999:dsp,
author = {Weiss, Matthias and Fettweis, Gerhard and Lorenz, Markus and Leupers, Rainer and Marwedel, Peter},
title = {Toolumgebung f\"ur plattformbasierte DSPs der n\"achsten Generation},
booktitle = {DSP Deutschland},
year = {1999},
address = {Munich/Germany},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-dspd.pdf},
confidential = {n},
abstract = {Digitale Signalprozessoren (DSPs) werden ueberall dort eingesetzt, wo bestimmte Performance- oder Aufwandsanforderungen nicht von Standardprozessoren erfuellt werden koennen. Beispiele sind Media- und Sprachcodecs, Modems, sowie Bild- und Spracherkennungen. Aufgrund der unterschiedlichen Performanceanforderungen dieser Anwendungen beginnen DSP-Hersteller nun, nicht einen DSP sondern eine Plattformloesung anzukuendigen. Beispiele sind der StarCore von Motorola/Lucent oder der TigerSharc von Analog Devices. Am Lehrstuhl Mobile Nachrichtensysteme der TU Dresden ist hierbei die M3-DSP Plattform entstanden. Eine Programmierumgebung fuer eine solche DSP-Plattform muss im Gegensatz zu konventionellen Ansaetzen auch die Anforderungen der Plattform mit unterstuetzen. In diesem Paper stellen wir die Architektur einer solchen Toolumgebung fuer die M3-Plattform vor und demonstrieren die Anwendbarkeit anhand von Beispielen.},
} Digitale Signalprozessoren (DSPs) werden ueberall dort eingesetzt, wo bestimmte Performance- oder Aufwandsanforderungen nicht von Standardprozessoren erfuellt werden koennen. Beispiele sind Media- und Sprachcodecs, Modems, sowie Bild- und Spracherkennungen. Aufgrund der unterschiedlichen Performanceanforderungen dieser Anwendungen beginnen DSP-Hersteller nun, nicht einen DSP sondern eine Plattformloesung anzukuendigen. Beispiele sind der StarCore von Motorola/Lucent oder der TigerSharc von Analog Devices. Am Lehrstuhl Mobile Nachrichtensysteme der TU Dresden ist hierbei die M3-DSP Plattform entstanden. Eine Programmierumgebung fuer eine solche DSP-Plattform muss im Gegensatz zu konventionellen Ansaetzen auch die Anforderungen der Plattform mit unterstuetzen. In diesem Paper stellen wir die Architektur einer solchen Toolumgebung fuer die M3-Plattform vor und demonstrieren die Anwendbarkeit anhand von Beispielen.
|
| U. Bieker, M. Kaibel, P. Marwedel and W. Geisselhardt. STAR-DUST: Hierarchical Test of Embedded Processors by Self-Test Programs. In European Test Workshop (ETW) June 1999 [BibTeX][PDF][Abstract]@inproceedings { bieker:1999:etw,
author = {Bieker, U. and Kaibel, M. and Marwedel, P. and Geisselhardt, W.},
title = {STAR-DUST: Hierarchical Test of Embedded Processors by Self-Test Programs},
booktitle = {European Test Workshop (ETW)},
year = {1999},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-etw.pdf},
confidential = {n},
abstract = {This paper describes the hierarchical test-generation method STAR-DUST, using self-test program generator RESTART, test pattern generator DUST, fault simulator FAUST and SYNOPSYS logic synthesis tools. RESTART aims at supporting self-test of embedded processors. Its integration into the STAR-DUST environment allows test program generation for realistic fault assumptions and provides, for the first time, experimental data on the fault coverage that can be obtained for full processor models. Experimental data shows that fault masking is not a problem even though the considered processor has to perform result comparison and arithmetic operations in the same ALU.},
} This paper describes the hierarchical test-generation method STAR-DUST, using self-test program generator RESTART, test pattern generator DUST, fault simulator FAUST and SYNOPSYS logic synthesis tools. RESTART aims at supporting self-test of embedded processors. Its integration into the STAR-DUST environment allows test program generation for realistic fault assumptions and provides, for the first time, experimental data on the fault coverage that can be obtained for full processor models. Experimental data shows that fault masking is not a problem even though the considered processor has to perform result comparison and arithmetic operations in the same ALU.
|
| Rajesh K. Gupta, Sujit Dey and Peter Marwedel. Embedded System Design and Validation: Building Systems from IC Cores to Chips (Tutorial). In Eleventh Int. conf on VLSI Design 1998 [BibTeX][Abstract]@inproceedings { gupta:1998:vlsi,
author = {Gupta, Rajesh K. and Dey, Sujit and Marwedel, Peter},
title = {Embedded System Design and Validation: Building Systems from IC Cores to Chips (Tutorial)},
booktitle = {Eleventh Int. conf on VLSI Design},
year = {1998},
confidential = {n},
abstract = {This tutorial addresses the challenges in the design and validation of an embedded system-on-a-chip. In particular, we examine the design and use of pre-designed, pre-characterized, and pre-verified silicon building blocks called `cores.' We examine design styles for portability and reuse of core cells, and how these can be validated for given applications. We examine the trend in IC cores by describing a wide range of available processor, DSP, interface and analog cores and how these are used in application domains such as multimedia and networkings systems. Covered topics include emulation, in-circuit emulation, compliance test environments, instruction-set simulation, and hardware-software co-simulation. We discuss the increasing software content on chips, made possible by proliferation of processor and DSP cores. We examine application-specific instruction-set (ASIP) and signal processors (ASSPs) and their use in specific application domains. We then present new code optimization approaches related to memory allocation and code compaction that exploit the embedded processor architectures. Finally, we present techniques for retargeting compilers to new architectures easily. In particular, we show how compilers can be generated from descriptions of processor architectures.},
} This tutorial addresses the challenges in the design and validation of an embedded system-on-a-chip. In particular, we examine the design and use of pre-designed, pre-characterized, and pre-verified silicon building blocks called `cores.' We examine design styles for portability and reuse of core cells, and how these can be validated for given applications. We examine the trend in IC cores by describing a wide range of available processor, DSP, interface and analog cores and how these are used in application domains such as multimedia and networkings systems. Covered topics include emulation, in-circuit emulation, compliance test environments, instruction-set simulation, and hardware-software co-simulation. We discuss the increasing software content on chips, made possible by proliferation of processor and DSP cores. We examine application-specific instruction-set (ASIP) and signal processors (ASSPs) and their use in specific application domains. We then present new code optimization approaches related to memory allocation and code compaction that exploit the embedded processor architectures. Finally, we present techniques for retargeting compilers to new architectures easily. In particular, we show how compilers can be generated from descriptions of processor architectures.
|
| Ralf Niemann and Peter Marwedel. Synthesis of Communicating Controllers for Concurrent Hardware/Software Systems. In Design, Automation and Test in Europe (DATE) Paris/France, February 1998 [BibTeX][PDF][Abstract]@inproceedings { niemann:1998:date,
author = {Niemann, Ralf and Marwedel, Peter},
title = {Synthesis of Communicating Controllers for Concurrent Hardware/Software Systems},
booktitle = {Design, Automation and Test in Europe (DATE)},
year = {1998},
address = {Paris/France},
month = feb,
keywords = {hwsw},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1998-date-niemann.pdf},
confidential = {n},
abstract = {Two main aspects in hardware/software codesign are hardware/software partitioning and co-synthesis. Most codesign approaches work only on one of these problems. In this paper, an approach coupling hardware/software partitioning and co-synthesis will be presented, working fully-automatic. The techniques have been integrated in the codesign tool COOL (COdesign toOL) supporting the complete design flow from system specification to board-level implementation for multi-processor and multi-ASIC target architectures for data-flow dominated applications.},
} Two main aspects in hardware/software codesign are hardware/software partitioning and co-synthesis. Most codesign approaches work only on one of these problems. In this paper, an approach coupling hardware/software partitioning and co-synthesis will be presented, working fully-automatic. The techniques have been integrated in the codesign tool COOL (COdesign toOL) supporting the complete design flow from system specification to board-level implementation for multi-processor and multi-ASIC target architectures for data-flow dominated applications.
|
| Rainer Leupers and Peter Marwedel. Formale Methoden in der Codeerzeugung für digitale Signalprozessoren. In 5. GI/ITG/GMM Workshop "Methoden des Entwurfs und der Verifikation digitaler Systeme" Linz/Austria, April 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:itg,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Formale Methoden in der Codeerzeugung f\"ur digitale Signalprozessoren},
booktitle = {5. GI/ITG/GMM Workshop ``Methoden des Entwurfs und der Verifikation digitaler Systeme''},
year = {1997},
address = {Linz/Austria},
month = apr,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-gi-itg.pdf},
confidential = {n},
abstract = {Der Bereich HW/SW-Codesign f\"ur eingebettete Systeme umfa\"st neben Methoden zur HW/SW-Partitionierung und Hardwaresynthese notwendigerweise auch Techniken zur Codeerzeugung f\"ur eingebettete programmierbare Prozessoren. Speziell im Falle von digitalen Signalprozessoren (DSPs) ist die Qualit\"at verf\"ugbarer Compiler unzureichend. Zur Vermeidung von aufwendiger Programmierung auf Assemblerebene sind daher neue DSP-spezifische Codeerzeugungstechiken notwendig. Dieser Beitrag stellt den Compiler RECORD vor, welcher f\"ur eine Klasse von DSPs Hochsprachenprogramme in Maschinencode \"ubersetzt. Um den speziellen Anforderungen an Compiler f\"ur DSPs gerecht zu werden, werden teilweise formale Methoden eingesetzt. Wir stellen zwei solche f\"ur RECORD entwickelte Methoden vor, welche zur Analyse von Prozessormodellen sowie zur Code-Kompaktierung verwendet werden, und diskutieren deren praktische Anwendung.},
} Der Bereich HW/SW-Codesign für eingebettete Systeme umfaßt neben Methoden zur HW/SW-Partitionierung und Hardwaresynthese notwendigerweise auch Techniken zur Codeerzeugung für eingebettete programmierbare Prozessoren. Speziell im Falle von digitalen Signalprozessoren (DSPs) ist die Qualität verfügbarer Compiler unzureichend. Zur Vermeidung von aufwendiger Programmierung auf Assemblerebene sind daher neue DSP-spezifische Codeerzeugungstechiken notwendig. Dieser Beitrag stellt den Compiler RECORD vor, welcher für eine Klasse von DSPs Hochsprachenprogramme in Maschinencode übersetzt. Um den speziellen Anforderungen an Compiler für DSPs gerecht zu werden, werden teilweise formale Methoden eingesetzt. Wir stellen zwei solche für RECORD entwickelte Methoden vor, welche zur Analyse von Prozessormodellen sowie zur Code-Kompaktierung verwendet werden, und diskutieren deren praktische Anwendung.
|
| Rainer Leupers and Peter Marwedel. Optimierende Compiler für DSPs: Was ist verfügbar ? (in German). In DSP Deutschland '97 Munich/Germany, October 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:dsp,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Optimierende Compiler f\"ur DSPs: Was ist verf\"ugbar ? (in German)},
booktitle = {DSP Deutschland '97},
year = {1997},
address = {Munich/Germany},
month = oct,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-dsp-deutschland.pdf},
confidential = {n},
abstract = {Die Softwareentwicklung f\"ur eingebettete Prozessoren findet heute gr\"o\"stenteils noch auf Assemblerebene statt. Der Grund f\"ur diesen langfristig wohl unhaltbaren Zustand liegt in der mangelnden Verf\"ugbarkeit von guten C-Compilern. In den letzten Jahren wurden allerdings wesentliche Fortschritte in der Codeoptimierung - speziell f\"ur DSPs - erzielt, welche bisher nur unzureichend in kommerzielle Produkte umgesetzt wurden. Dieser Beitrag zeigt die prinzipiellen Optimierungsquellen auf und fa\"st den Stand der Technik zusammen. Die zentralen Methoden hierbei sind komplexe Optimierungsverfahren, welche \"uber die traditionelle Compilertechnologie hinausgehen, sowie die Ausnutzung der DSP-spezifischen Hardware-Architekturen zur effizienten \"Ubersetzung von C-Sprachkonstrukten in DSP-Maschinenbefehle. Die genannten Verfahren lassen sich teilweise auch allgemein auf (durch Compiler generierte oder handgeschriebene) Assemblerprogramme anwenden.},
} Die Softwareentwicklung für eingebettete Prozessoren findet heute größtenteils noch auf Assemblerebene statt. Der Grund für diesen langfristig wohl unhaltbaren Zustand liegt in der mangelnden Verfügbarkeit von guten C-Compilern. In den letzten Jahren wurden allerdings wesentliche Fortschritte in der Codeoptimierung - speziell für DSPs - erzielt, welche bisher nur unzureichend in kommerzielle Produkte umgesetzt wurden. Dieser Beitrag zeigt die prinzipiellen Optimierungsquellen auf und faßt den Stand der Technik zusammen. Die zentralen Methoden hierbei sind komplexe Optimierungsverfahren, welche über die traditionelle Compilertechnologie hinausgehen, sowie die Ausnutzung der DSP-spezifischen Hardware-Architekturen zur effizienten Übersetzung von C-Sprachkonstrukten in DSP-Maschinenbefehle. Die genannten Verfahren lassen sich teilweise auch allgemein auf (durch Compiler generierte oder handgeschriebene) Assemblerprogramme anwenden.
|
| Rainer Leupers and Peter Marwedel. Retargetable Generation of Code Selectors from HDL Processor Models. In European Design & Test Conference (ED & TC) Paris/France, March 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:edtc,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Retargetable Generation of Code Selectors from HDL Processor Models},
booktitle = {European Design \& Test Conference (ED \& TC)},
year = {1997},
address = {Paris/France},
month = mar,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-edtc.pdf},
confidential = {n},
abstract = {Besides high code quality, a primary issue in embedded code generation is retargetability of code generators. This paper presents techniques for automatic generation of code selectors from externally specified processor models. In contrast to previous work, our retargetable compiler RECORD does not require tool-specific modelling formalisms, but starts from general HDL processor models. From an HDL model, all processor aspects needed for code generation are automatically derived. As demonstrated by experimental results, short turnaround times for retargeting are achieved, which permits to study the HW/SW trade-off between processor architectures and program execution speed.},
} Besides high code quality, a primary issue in embedded code generation is retargetability of code generators. This paper presents techniques for automatic generation of code selectors from externally specified processor models. In contrast to previous work, our retargetable compiler RECORD does not require tool-specific modelling formalisms, but starts from general HDL processor models. From an HDL model, all processor aspects needed for code generation are automatically derived. As demonstrated by experimental results, short turnaround times for retargeting are achieved, which permits to study the HW/SW trade-off between processor architectures and program execution speed.
|
| Peter Marwedel. Code Generation for Core Processors. In Invited embedded tutorial, 34th Design Automation Conference 1997 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1997:dac,
author = {Marwedel, Peter},
title = {Code Generation for Core Processors},
booktitle = {Invited embedded tutorial, 34th Design Automation Conference},
year = {1997},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-f_dac.pdf},
confidential = {n},
abstract = {This tutorial responds to the rapidly increasing use of cores in general and of processor cores in particular for implementing systems-on-a-chip. In the first part of this text, we will provide a brief introduction to various cores. Applications can be found in most segments of the embedded systems market. These applications demand for extreme efficiency, and in particular for efficient processor architectures and for efficient embedded software. In the second part of this text, we will show that current compilers do not provide the required efficiency and we will give an overview over new compiler optimization techniques, which aim at making assembly language programming for embedded software obsolete. These new techniques take advantage of the special characteristics of embedded software and embedded architectures. Due to efficiency considerations, processor architectures optimized for application domains or even for particular applications are of interest. This results in a large number of architectures and instruction sets, leading to the requirement for retargeting compilers to those numerous architectures. In the final section of the tutorial, we will present techniques for retargeting compilers to new architectures easily. We will show, how compilers can be generated from descriptions of processors. One of the approaches closes the gap which so far existed between electronic CAD and compiler generation.},
} This tutorial responds to the rapidly increasing use of cores in general and of processor cores in particular for implementing systems-on-a-chip. In the first part of this text, we will provide a brief introduction to various cores. Applications can be found in most segments of the embedded systems market. These applications demand for extreme efficiency, and in particular for efficient processor architectures and for efficient embedded software. In the second part of this text, we will show that current compilers do not provide the required efficiency and we will give an overview over new compiler optimization techniques, which aim at making assembly language programming for embedded software obsolete. These new techniques take advantage of the special characteristics of embedded software and embedded architectures. Due to efficiency considerations, processor architectures optimized for application domains or even for particular applications are of interest. This results in a large number of architectures and instruction sets, leading to the requirement for retargeting compilers to those numerous architectures. In the final section of the tutorial, we will present techniques for retargeting compilers to new architectures easily. We will show, how compilers can be generated from descriptions of processors. One of the approaches closes the gap which so far existed between electronic CAD and compiler generation.
|
| Peter Marwedel. Processor-Core Based Design and Test - Invited Embedded Tutorial. In Asia and South Pacific Design Automation Conference 1998 Tokyo, Japan, 1997 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1997:tut,
author = {Marwedel, Peter},
title = {Processor-Core Based Design and Test - Invited Embedded Tutorial},
booktitle = {Asia and South Pacific Design Automation Conference 1998},
year = {1997},
address = {Tokyo, Japan},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-aspdactut.pdf},
confidential = {n},
abstract = {This tutorial responds to the rapidly increasing use of various cores for implementing systems-on-a-chip. It specifically focusses on processor cores. We will give some examples of cores, including DSP cores and application-specific instruction-set processors (ASIPs). We will mention market trends for these components, and we will touch design procedures, in particular the use compilers. Finally, we will discuss the problem of testing core-based designs. Existing solutions include boundary scan, embedded in-circuit emulation (ICE), the use of processor resources for stimuli/response compaction and self-test programs.},
} This tutorial responds to the rapidly increasing use of various cores for implementing systems-on-a-chip. It specifically focusses on processor cores. We will give some examples of cores, including DSP cores and application-specific instruction-set processors (ASIPs). We will mention market trends for these components, and we will touch design procedures, in particular the use compilers. Finally, we will discuss the problem of testing core-based designs. Existing solutions include boundary scan, embedded in-circuit emulation (ICE), the use of processor resources for stimuli/response compaction and self-test programs.
|
| Birger Landwehr, Peter Marwedel, Ingolf Markhof and Rainer Dömer. Exploiting Isomorphism for Speeding-Up Instance-Binding in an Integrated Scheduling, Allocation and Assignment Approach to Architectural Synthesis. In Conference on Computer Hardware Description Languages and their Applications Toledo, Spain, April 1997 [BibTeX][PDF][Abstract]@inproceedings { landwehr:1997:chdl,
author = {Landwehr, Birger and Marwedel, Peter and Markhof, Ingolf and D\"omer, Rainer},
title = {Exploiting Isomorphism for Speeding-Up Instance-Binding in an Integrated Scheduling, Allocation and Assignment Approach to Architectural Synthesis},
booktitle = {Conference on Computer Hardware Description Languages and their Applications},
year = {1997},
address = {Toledo, Spain},
month = apr,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-chdl.pdf},
confidential = {n},
abstract = {Register-Transfer (RT-) level netlists are said to be isomorphic if they can be made identical by relabeling RT-components. RT-netlists can be generated by architectural synthesis. In order to consider just the essential design decisions, architectural synthesis should consider only a single representative of sets of isomorphic netlists. In this paper, we are using netlist isomorphism for the very first time in architectural synthesis. Furthermore, we describe how an integer-programming (IP-) based synthesis technique can be extended to take advantage of netlist isomorphism.},
} Register-Transfer (RT-) level netlists are said to be isomorphic if they can be made identical by relabeling RT-components. RT-netlists can be generated by architectural synthesis. In order to consider just the essential design decisions, architectural synthesis should consider only a single representative of sets of isomorphic netlists. In this paper, we are using netlist isomorphism for the very first time in architectural synthesis. Furthermore, we describe how an integer-programming (IP-) based synthesis technique can be extended to take advantage of netlist isomorphism.
|
| Rainer Leupers and Peter Marwedel. Retargetable Compilers for Embedded DSPs. In 7th European Multimedia, Microprocessor Systems and Electronic Commerce Conference (EMMSEC) Florence/Italy, November 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:emmsec,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Retargetable Compilers for Embedded DSPs},
booktitle = {7th European Multimedia, Microprocessor Systems and Electronic Commerce Conference (EMMSEC)},
year = {1997},
address = {Florence/Italy},
month = nov,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-emmsec.pdf},
confidential = {n},
abstract = {Programmable devices are a key technology for the design of embedded systems, such as in the consumer electronics market. Processor cores are used as building blocks for more and more embedded system designs, since they provide a unique combination of features: flexibility and reusability. Processor-based design implies that compilers capable of generating efficient machine code are necessary. However, highly efficient compilers for embedded processors are hardly available. In particular, this holds for digital signal processors (DSPs). This contribution is intended to outline different aspects of DSP compiler technology. First, we cover demands on compilers for embedded DSPs, which are partially in sharp contrast to traditional compiler construction. Secondly, we present recent advances in DSP code optimization techniques, which explore a comparatively large search space in order to achieve high code quality. Finally, we discuss the different approaches to retargetability of compilers, that is, techniques for automatic generation of compilers from processor models.},
} Programmable devices are a key technology for the design of embedded systems, such as in the consumer electronics market. Processor cores are used as building blocks for more and more embedded system designs, since they provide a unique combination of features: flexibility and reusability. Processor-based design implies that compilers capable of generating efficient machine code are necessary. However, highly efficient compilers for embedded processors are hardly available. In particular, this holds for digital signal processors (DSPs). This contribution is intended to outline different aspects of DSP compiler technology. First, we cover demands on compilers for embedded DSPs, which are partially in sharp contrast to traditional compiler construction. Secondly, we present recent advances in DSP code optimization techniques, which explore a comparatively large search space in order to achieve high code quality. Finally, we discuss the different approaches to retargetability of compilers, that is, techniques for automatic generation of compilers from processor models.
|
| Peter Marwedel. Compilers for Embedded Processors. In Invited Embedded Tutorial SASIMI, Osaka, 1997 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1997:sasimi,
author = {Marwedel, Peter},
title = {Compilers for Embedded Processors},
booktitle = {Invited Embedded Tutorial},
year = {1997},
address = {SASIMI, Osaka},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-sasimi.pdf},
confidential = {n},
abstract = {This talk responds to the rapidly increasing use of embedded processors for implementing systems. Such processors come in the form of discrete processors as well as in the form of core processors. They are available both from vendors and within system companies. Applications can be found in most segments of the embedded system market, such as automotive electronics and telecommunications. These applications demand for extremely efficient processor architectures, optimized for a certain application domain or even a certain application. Current compiler technology supports these architectures very poorly and has recently been recognized as a major bottleneck for designing systems quickly, efficiently and reliably. A number of recent research projects aim at removing this bottleneck. The talk will briefly discuss the trend towards embedded processors. We will show market trends and examples of recent embedded processors. We will also introduce the terms "application specific instruction-set processors" (ASIPs), "application-specific signal processors" (ASSPs), "soft cores" and "hard cores". We will then present new code optimization approaches taking the special characteristics of embedded processor architectures into account. In particular, we will present new memory allocation and code compaction algorithms. In the final section of the talk, we will present techniques for retargeting compilers to new architectures easily. These techniques are motivated by the need for domain- or application-dependent optimizations of processor architectures. The scope for such optimizations should not be restricted to hardware architectures but has to include the corresponding work on compilers as well. We will show, how compilers can be generated from descriptions of processor architectures. Presented techniques aim at bridging the gap between electronic CAD and compiler generation. },
} This talk responds to the rapidly increasing use of embedded processors for implementing systems. Such processors come in the form of discrete processors as well as in the form of core processors. They are available both from vendors and within system companies. Applications can be found in most segments of the embedded system market, such as automotive electronics and telecommunications. These applications demand for extremely efficient processor architectures, optimized for a certain application domain or even a certain application. Current compiler technology supports these architectures very poorly and has recently been recognized as a major bottleneck for designing systems quickly, efficiently and reliably. A number of recent research projects aim at removing this bottleneck. The talk will briefly discuss the trend towards embedded processors. We will show market trends and examples of recent embedded processors. We will also introduce the terms "application specific instruction-set processors" (ASIPs), "application-specific signal processors" (ASSPs), "soft cores" and "hard cores". We will then present new code optimization approaches taking the special characterstics of embedded processor architectures into account. In particular, we will present new memory allocation and code compaction algorithms. In the final section of the talk, we will present techniques for retargeting compilers to new architectures easily. These techniques are motivated by the need for domain- or application-dependent optimizations of processor architectures. The scope for such optimizations should not be restricted to hardware architectures but has to include the corresponding work on compilers as well. We will show, how compilers can be generated from descriptions of processor architectures. Presented techniques aim at bridging the gap between electronic CAD and compiler generation. |
| Rainer Leupers and Peter Marwedel. Instruction-Set Modelling for ASIP Code Generation. In 9th Int. Conference on VLSI Design Bangalore/India, January 1996 [BibTeX][PDF][Abstract]@inproceedings { leupers:1996:vlsi,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Instruction-Set Modelling for ASIP Code Generation},
booktitle = {9th Int. Conference on VLSI Design},
year = {1996},
address = {Bangalore/India},
month = jan,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-vlsi-design.pdf},
confidential = {n},
abstract = {A main objective in code generation for ASIPs is to develop retargetable compilers in order to permit exploration of different architectural alternatives within short turnaround time. Retargetability requires that the compiler is supplied with a formal description of the target processor. This description is usually transformed into an internal instruction set model, on which the actual code generation operates. In this contribution we analyze the demands on instruction set models for retargetable code generation, and we present a formal instruction set model which meets these demands. Compared to previous work, it covers a broad range of instruction formats and includes a detailed view of inter-instruction restrictions.},
} A main objective in code generation for ASIPs is to develop retargetable compilers in order to permit exploration of different architectural alternatives within short turnaround time. Retargetability requires that the compiler is supplied with a formal description of the target processor. This description is usually transformed into an internal instruction set model, on which the actual code generation operates. In this contribution we analyze the demands on instruction set models for retargetable code generation, and we present a formal instruction set model which meets these demands. Compared to previous work, it covers a broad range of instruction formats and includes a detailed view of inter-instruction restrictions.
|
| Ralf Niemann and Peter Marwedel. Hardware/Software Partitioning using Integer Programming. In Proc. European Design & Test Conference 1996 [BibTeX][PDF][Abstract]@inproceedings { niemann:1996:edtc,
author = {Niemann, Ralf and Marwedel, Peter},
title = {Hardware/Software Partitioning using Integer Programming},
booktitle = {Proc. European Design \& Test Conference},
year = {1996},
keywords = {hwsw},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-edtc.pdf},
confidential = {n},
abstract = {One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated respective to the chosen objective function. The partitioning approach works fully automatic and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating the cost metrics is compensated by an improved quality of the estimations compared to the results of estimators. Therefore, fewer iteration steps of partitioning will be needed. The paper will show that using integer programming to solve the hardware/software partitioning problem is feasible and leads to promising results.},
} One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated respective to the chosen objective function. The partitioning approach works fully automatic and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating the cost metrics is compensated by an improved quality of the estimations compared to the results of estimators. Therefore, fewer iteration steps of partitioning will be needed. The paper will show that using integer programming to solve the hardware/software partitioning problem is feasible and leads to promising results.
|
| Rainer Leupers and Peter Marwedel. Algorithms for Address Assignment in DSP Code Generation. In Int. Conf. on Computer-Aided Design (ICCAD) San Jose, November 1996 [BibTeX][PDF][Abstract]@inproceedings { leupers:1996:iccad,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Algorithms for Address Assignment in {DSP} Code Generation},
booktitle = {Int. Conf. on Computer-Aided Design (ICCAD)},
year = {1996},
address = {San Jose},
month = nov,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-iccad.pdf},
confidential = {n},
abstract = {This paper presents DSP code optimization techniques, which originate from dedicated memory address generation hardware. We define a generic model of DSP address generation units. Based on this model, we present efficient heuristics for computing memory layouts for program variables, which optimize utilization of parallel address generation units. Improvements and generalizations of previous work are described, and the efficacy of the proposed algorithms is demonstrated through experimental evaluation.},
} This paper presents DSP code optimization techniques, which originate from dedicated memory address generation hardware. We define a generic model of DSP address generation units. Based on this model, we present efficient heuristics for computing memory layouts for program variables, which optimize utilization of parallel address generation units. Improvements and generalizations of previous work are described, and the efficacy of the proposed algorithms is demonstrated through experimental evaluation.
|
| Rainer Leupers and Peter Marwedel. Instruction Selection for Embedded DSPs with Complex Instructions. In European Design Automation Conference (EURO-DAC) Geneva/Switzerland, September 1996 [BibTeX][PDF][Abstract]@inproceedings { leupers:1996:eurodac,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Instruction Selection for Embedded {DSPs} with Complex Instructions},
booktitle = {European Design Automation Conference (EURO-DAC)},
year = {1996},
address = {Geneva/Switzerland},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-eurodac.pdf},
confidential = {n},
abstract = {We address the problem of instruction selection in code generation for embedded digital signal processors. Recent work has shown that this task can be efficiently solved by tree covering with dynamic programming, even in combination with the task of register allocation. However, performing instruction selection by tree covering only does not exploit available instruction-level parallelism, for instance in form of multiply-accumulate instructions or parallel data moves. In this paper we investigate how such complex instructions may affect detection of optimal tree covers, and we present a two-phase scheme for instruction selection which exploits available instruction-level parallelism. At the expense of higher compilation time, this technique may significantly increase the code quality compared to previous work, which is demonstrated for a widespread DSP.},
} We address the problem of instruction selection in code generation for embedded digital signal processors. Recent work has shown that this task can be efficiently solved by tree covering with dynamic programming, even in combination with the task of register allocation. However, performing instruction selection by tree covering only does not exploit available instruction-level parallelism, for instance in form of multiply-accumulate instructions or parallel data moves. In this paper we investigate how such complex instructions may affect detection of optimal tree covers, and we present a two-phase scheme for instruction selection which exploits available instruction-level parallelism. At the expense of higher compilation time, this technique may significantly increase the code quality compared to previous work, which is demonstrated for a widespread DSP.
|
| Rainer Leupers and Peter Marwedel. A BDD-based Frontend for Retargetable Compilers. In Proc. European Design & Test Conference 1995 [BibTeX][PDF][Abstract]@inproceedings { leupers:1995:edtc,
author = {Leupers, Rainer and Marwedel, Peter},
title = {A {BDD}-based Frontend for Retargetable Compilers},
booktitle = {Proc. European Design \& Test Conference},
year = {1995},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-edtc.pdf},
confidential = {n},
abstract = {In this paper we present a unified frontend for retargetable compilers that performs analysis of the target processor model. Our approach bridges the gap between structural and behavioral processor models for retargetable compilation. This is achieved by means of instruction set extraction. The extraction technique is based on a BDD data structure which significantly improves control signal analysis in the target processor compared to previous approaches.},
} In this paper we present a unified frontend for retargetable compilers that performs analysis of the target processor model. Our approach bridges the gap between structural and behavioral processor models for retargetable compilation. This is achieved by means of instruction set extraction. The extraction technique is based on a BDD data structure which significantly improves control signal analysis in the target processor compared to previous approaches.
|
| Rainer Leupers and Peter Marwedel. Time-constrained Code Compaction for DSPs. In Int. Symp. on System Synthesis (ISSS) September 1995 [BibTeX][PDF][Abstract]@inproceedings { leupers:1995:isss,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Time-constrained Code Compaction for {DSPs}},
booktitle = {Int. Symp. on System Synthesis (ISSS)},
year = {1995},
month = sep,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-isss.pdf},
confidential = {n},
abstract = {DSP algorithms in most cases are subject to hard real-time constraints. In case of programmable DSP processors, meeting those constraints must be ensured by appropriate code generation techniques. For processors offering instruction-level parallelism, the task of code generation includes code compaction. The exact timing behavior of a DSP program is only known after compaction. Therefore, real-time constraints should be taken into account during the compaction phase. While most known DSP code generators rely on rigid heuristics for that phase, this paper proposes a novel approach to local code compaction based on an Integer Programming model, which obeys exact timing constraints. Due to a general problem formulation, the model also obeys encoding restrictions and possible side effects.},
} DSP algorithms in most cases are subject to hard real-time constraints. In case of programmable DSP processors, meeting those constraints must be ensured by appropriate code generation techniques. For processors offering instruction-level parallelism, the task of code generation includes code compaction. The exact timing behavior of a DSP program is only known after compaction. Therefore, real-time constraints should be taken into account during the compaction phase. While most known DSP code generators rely on rigid heuristics for that phase, this paper proposes a novel approach to local code compaction based on an Integer Programming model, which obeys exact timing constraints. Due to a general problem formulation, the model also obeys encoding restrictions and possible side effects.
|
| Rainer Leupers and Peter Marwedel. Using Compilers for Heterogeneous System Design. In Int. Conf. on Parallel Architectures and Compilation Techniques (PACT) June 1995 [BibTeX][PDF][Abstract]@inproceedings { leupers:1995:pact,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Using Compilers for Heterogeneous System Design},
booktitle = {Int. Conf. on Parallel Architectures and Compilation Techniques (PACT)},
year = {1995},
month = jun,
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-pact.pdf},
confidential = {n},
abstract = {Heterogeneous systems combine both data and control processing functions. A programmable DSP core forms the central component. The design of such systems establishes a new application of compilers in electronic CAD: In order to meet given real-time constraints and optimize chip area consumption, the DSP core needs to be customized for each application. In turn, this requires compiler support for evaluating different architectural alternatives. This paper discusses the importance of retargetable compilers in heterogeneous system design.},
} Heterogeneous systems combine both data and control processing functions. A programmable DSP core forms the central component. The design of such systems establishes a new application of compilers in electronic CAD: In order to meet given real-time constraints and optimize chip area consumption, the DSP core needs to be customized for each application. In turn, this requires compiler support for evaluating different architectural alternatives. This paper discusses the importance of retargetable compilers in heterogeneous system design.
|
| Ulrich Bieker and Peter Marwedel. Retargetable self-test program generation using constraint logic programming. In Proceedings of the 32nd annual ACM/IEEE Design Automation Conference, pages 605--611 New York, NY, USA, 1995 [BibTeX][PDF][Link]@inproceedings { Bieker:1995:RSP:217474.217597,
author       = {Bieker, Ulrich and Marwedel, Peter},
title        = {Retargetable self-test program generation using constraint logic programming},
booktitle    = {Proceedings of the 32nd annual ACM/IEEE Design Automation Conference},
series       = {DAC '95},
pages        = {605--611},
address      = {New York, NY, USA},
publisher    = {ACM},
year         = {1995},
url          = {http://doi.acm.org/10.1145/217474.217597},
file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-dac.pdf},
confidential = {n},
} |
| Rainer Leupers, Wolfgang Schenk and Peter Marwedel. Retargetable Assembly Code Generation by Bootstrapping. In Proc. 7th International Symposium on High-Level Synthesis 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:hlss,
author       = {Leupers, Rainer and Schenk, Wolfgang and Marwedel, Peter},
title        = {Retargetable Assembly Code Generation by Bootstrapping},
booktitle    = {Proc. 7th International Symposium on High-Level Synthesis},
year         = {1994},
file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-hlss.pdf},
confidential = {n},
abstract     = {In a hardware/software codesign environment compilers are needed that map software components of a partitioned system behavioral description onto a programmable processor. Since the processor structure is not static, but can repeatedly change during the design process, the compiler should be retargetable in order to avoid manual compiler adaption for each alternative architecture. A restriction of existing retargetable compilers is that they only generate microcode for the target architecture instead of machine-level code. In this paper we introduce a bootstrapping technique permitting to translate high-level language (HLL) programs into real machine-level code using a retargetable microcode compiler. Retargetability is preserved, permitting to compare different architectural alternatives in a codesign framework within relatively short time.},
} In a hardware/software codesign environment compilers are needed that map software components of a partitioned system behavioral description onto a programmable processor. Since the processor structure is not static, but can repeatedly change during the design process, the compiler should be retargetable in order to avoid manual compiler adaption for each alternative architecture. A restriction of existing retargetable compilers is that they only generate microcode for the target architecture instead of machine-level code. In this paper we introduce a bootstrapping technique permitting to translate high-level language (HLL) programs into real machine-level code using a retargetable microcode compiler. Retargetability is preserved, permitting to compare different architectural alternatives in a codesign framework within relatively short time.
|
| Rainer Leupers, Ralf Niemann and Peter Marwedel. Methods for Retargetable DSP Code Generation. In IEEE Workshop on VLSI Signal Processing 1994 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:ieee,
author = {Leupers, Rainer and Niemann, Ralf and Marwedel, Peter},
title = {Methods for Retargetable {DSP} Code Generation},
booktitle = {IEEE Workshop on VLSI Signal Processing 1994},
year = {1994},
keywords = {hwsw},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-IEEE.pdf},
confidential = {n},
abstract = {Efficient embedded DSP system design requires methods of hardware/software codesign. In this contribution we focus on software synthesis for partitioned system behavioral descriptions. In previous approaches, this task is performed by compiling the behavioral descriptions onto standard processors using target-specific compilers. It is argued that abandoning this restriction allows for higher degrees of freedom in design space exploration. In turn, this demands for retargetable code generation tools. We present different schemes for DSP code generation using the MSSQ microcode generator. Experiments with industrial applications revealed that retargetable DSP code generation based on structural hardware descriptions is feasible, but there exists a strong dependency between the behavioral description style and the resulting code quality. As a result, necessary features of high-quality retargetable DSP code generators are identified.},
} Efficient embedded DSP system design requires methods of hardware/software codesign. In this contribution we focus on software synthesis for partitioned system behavioral descriptions. In previous approaches, this task is performed by compiling the behavioral descriptions onto standard processors using target-specific compilers. It is argued that abandoning this restriction allows for higher degrees of freedom in design space exploration. In turn, this demands for retargetable code generation tools. We present different schemes for DSP code generation using the MSSQ microcode generator. Experiments with industrial applications revealed that retargetable DSP code generation based on structural hardware descriptions is feasible, but there exists a strong dependency between the behavioral description style and the resulting code quality. As a result, necessary features of high-quality retargetable DSP code generators are identified.
|
| Rainer Leupers, Wolfgang Schenk and Peter Marwedel. Microcode Generation for Flexible Parallel Target Architectures. In IFIP Trans.\ A-50: Parallel Architectures and Compilation Techniques (PACT-94) 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:pact,
author = {Leupers, Rainer and Schenk, Wolfgang and Marwedel, Peter},
title = {Microcode Generation for Flexible Parallel Target Architectures},
booktitle = {IFIP Trans.\ A-50: Parallel Architectures and Compilation Techniques (PACT-94)},
year = {1994},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-pact.pdf},
confidential = {n},
abstract = {Advanced architectural features of microprocessors like instruction level parallelism and pipelined functional hardware units require code generation techniques beyond the scope of traditional compilers. Additionally, recent design styles in the area of digital signal processing pose a strong demand for retargetable compilation. This paper presents an approach to code generation based on netlist descriptions of the target processor. The basic features of the MSSQ microcode compiler are outlined, and novel techniques for handling complex hardware modules and multi-cycle operations are presented.},
} Advanced architectural features of microprocessors like instruction level parallelism and pipelined functional hardware units require code generation techniques beyond the scope of traditional compilers. Additionally, recent design styles in the area of digital signal processing pose a strong demand for retargetable compilation. This paper presents an approach to code generation based on netlist descriptions of the target processor. The basic features of the MSSQ microcode compiler are outlined, and novel techniques for handling complex hardware modules and multi-cycle operations are presented.
|
| Rainer Leupers and Peter Marwedel. Instruction Set Extraction From Programmable Structures. In Proc. EURO-DAC 1994 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:eurodac,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Instruction Set Extraction From Programmable Structures},
booktitle = {Proc. EURO-DAC 1994},
year = {1994},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-eurodac-extract.pdf},
confidential = {n},
abstract = {Due to the demand for more design flexibility and design reuse, ASIPs have emerged as a new important design style in the area of DSP systems. In order to obtain efficient hardware/software partitionings within ASIP-based systems, the designer has to be supported by CAD tools that allow frequent re-mapping of algorithms onto variable programmable target structures. This leads to a new class of design tools: retargetable compilers. Considering existing retargetable compilers based on pattern matching, automatic instruction set extraction is identified as a profitable frontend for those compilers. This paper presents concepts and an implementation of an instruction set extractor.},
} Due to the demand for more design flexibility and design reuse, ASIPs have emerged as a new important design style in the area of DSP systems. In order to obtain efficient hardware/software partitionings within ASIP-based systems, the designer has to be supported by CAD tools that allow frequent re-mapping of algorithms onto variable programmable target structures. This leads to a new class of design tools: retargetable compilers. Considering existing retargetable compilers based on pattern matching, automatic instruction set extraction is identified as a profitable frontend for those compilers. This paper presents concepts and an implementation of an instruction set extractor.
|
| Birger Landwehr and Peter Marwedel. Intelligent library component selection and management in an IP-model based high-level synthesis system. In IFIP Workshop on Logic and Architecture Synthesis Grenoble, 1993 [BibTeX]@inproceedings { landwehr:93:ifip,
author = {Landwehr, Birger and Marwedel, Peter},
title = {Intelligent library component selection and management in an {IP}-model based high-level synthesis system},
booktitle = {IFIP Workshop on Logic and Architecture Synthesis},
year = {1993},
address = {Grenoble},
confidential = {n},
} |
| Peter Marwedel. Tree-Based Mapping of Algorithms to Predefined Structures. In International Conference on Computer Aided Design (ICCAD) 1993 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1993:iccad,
author       = {Marwedel, Peter},
title        = {Tree-Based Mapping of Algorithms to Predefined Structures},
booktitle    = {International Conference on Computer Aided Design (ICCAD)},
year         = {1993},
file         = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-iccad.pdf},
confidential = {n},
abstract     = {Due to the need for fast design cycles and low production cost, programmable targets like DSP processors are becoming increasingly popular. Design planning, detailed design as well as updating such designs requires mapping existing algorithms onto these targets. Instead of writing target-specific mappers, we propose using retargetable mappers. The technique reported in this paper is based on pattern matching. Binary code is generated as a result of this matching process. This paper describes the techniques of our mapper MSSV and identifies areas for improvements. As a result, it shows that efficient handling of alternative mappings is crucial for an acceptable performance.},
} Due to the need for fast design cycles and low production cost, programmable targets like DSP processors are becoming increasingly popular. Design planning, detailed design as well as updating such designs requires mapping existing algorithms onto these targets. Instead of writing target-specific mappers, we propose using retargetable mappers. The technique reported in this paper is based on pattern matching. Binary code is generated as a result of this matching process. This paper describes the techniques of our mapper MSSV and identifies areas for improvements. As a result, it shows that efficient handling of alternative mappings is crucial for an acceptable performance.
|
| C. Albrecht, S. Bashford, P. Marwedel, A. Neumann and W. Schenk. The design of the PRIPS microprocessor. In 4th EUROCHIP-Workshop on VLSI Training 1993 [BibTeX][PDF][Abstract]@inproceedings { albrecht:1993:eurochip,
author = {Albrecht, C. and Bashford, S. and Marwedel, P. and Neumann, A. and Schenk, W.},
title = {The design of the {PRIPS} microprocessor},
booktitle = {4th EUROCHIP-Workshop on VLSI Training},
year = {1993},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-eurochips-albrecht.pdf},
confidential = {n},
abstract = {The PRIPS microprocessor was recently designed at the University of Dortmund, Lehrstuhl
Informatik XII. PRIPS is a coprocessor with an RISC-like instruction set. The supported
data types and some of the instructions are oriented towards supporting the execution of
PROLOG programs. The design was performed by a project group consisting of 10 stu-
dents. Such project groups are one of the key features of the computer science curriculum at
Dortmund. The project group was partitioned into three subgroups. The first subgroup was
responsible for everything related to compiling PROLOG programs into machine code. This
group designed the instruction set of PRIPS. the first approach was to consider implementing
the Warren abstract machine (WAM). This approach was rejected because of the size of the
required microcode. Therefore, it was decided, to implement a RISC-like instruction set with
some special instructions for PROLOG. With this approach, PROLOG-programs are first
compiled into WAM-code. WAM instructions are then expanded into RISC instructions. The
first subgroup also analysed the effect of different options for caches on the performance.
The second subgroup designed the register-transfer structure for the given instruction set. To
this end, semantics of the instruction set was described in the hardware description language
MIMOLA. Using the TODOS high-level synthesis system designed at the Universities of
Kiel and Dortmund, an initial RT-structure was generated. Subsequent improvements were
added by using the retargetable code generator MSSQ (see paper by Marwedel at EDAC-
EUROASIC-93 for a description of the design process). The final register transfer structure
contains a register file of 32 registers of 32 bits and an ALU with very sophisticated concurrent
tag checking modes. In order to achieve maximum flexibility, it was decided to implement an
on-chip loadable microstore. In order to improve testability of the chip, PRIPS uses external
clock generation and a scan-path design for the controller.
The third group entered the PRIPS design into a commercial Cadence EDGE CAD database.
Due to the problems with the EDIF interface, the design was entered using schematics entry.
Final layout was also obtained with EDGE. Several iterations were required to meet design
constraints. The final chip size is 12 by 8 mm for the ES2 1.5$\mu$m CMOS process.
PRIPS has been submitted to EUROCHIP for fabrication. After some delay, caused by
undocumented features of format converters, 30 samples were received in February, 1993.
The setup used for testing basically consists of an Hewlett Packard 16500A tester, which is
linked to a Sun workstation and programmed using the TSSI software package. First results
indicate that some of the chips are working. However, detailed results are not available yet.},
} The PRIPS microprocessor was recently designed at the University of Dortmund, Lehrstuhl
Informatik XII. PRIPS is a coprocessor with an RISC-like instruction set. The supported
data types and some of the instructions are oriented towards supporting the execution of
PROLOG programs. The design was performed by a project group consisting of 10 stu-
dents. Such project groups are one of the key features of the computer science curriculum at
Dortmund. The project group was partitioned into three subgroups. The first subgroup was
responsible for everything related to compiling PROLOG programs into machine code. This
group designed the instruction set of PRIPS. the first approach was to consider implementing
the Warren abstract machine (WAM). This approach was rejected because of the size of the
required microcode. Therefore, it was decided, to implement a RISC-like instruction set with
some special instructions for PROLOG. With this approach, PROLOG-programs are first
compiled into WAM-code. WAM instructions are then expanded into RISC instructions. The
first subgroup also analysed the effect of different options for caches on the performance.
The second subgroup designed the register-transfer structure for the given instruction set. To
this end, semantics of the instruction set was described in the hardware description language
MIMOLA. Using the TODOS high-level synthesis system designed at the Universities of
Kiel and Dortmund, an initial RT-structure was generated. Subsequent improvements were
added by using the retargetable code generator MSSQ (see paper by Marwedel at EDAC-
EUROASIC-93 for a description of the design process). The final register transfer structure
contains a register file of 32 registers of 32 bits and an ALU with very sophisticated concurrent
tag checking modes. In order to achieve maximum flexibility, it was decided to implement an
on-chip loadable microstore. In order to improve testability of the chip, PRIPS uses external
clock generation and a scan-path design for the controller.
The third group entered the PRIPS design into a commercial Cadence EDGE CAD database.
Due to the problems with the EDIF interface, the design was entered using schematics entry.
Final layout was also obtained with EDGE. Several iterations were required to meet design
constraints. The final chip size is 12 by 8 mm for the ES2 1.5μm CMOS process.
PRIPS has been submitted to EUROCHIP for fabrication. After some delay, caused by
undocumented features of format converters, 30 samples were received in February, 1993.
The setup used for testing basically consists of an Hewlett Packard 16500A tester, which is
linked to a Sun workstation and programmed using the TSSI software package. First results
indicate that some of the chips are working. However, detailed results are not available yet.
|
| Peter Marwedel. Matching System and Component Behaviour in MIMOLA Synthesis Tools. In Proc. European Design Automation Conference (EDAC) 1990 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1990:edac,
author = {Marwedel, Peter},
title = {Matching System and Component Behaviour in {MIMOLA} Synthesis Tools},
booktitle = {Proc. European Design Automation Conference (EDAC)},
year = {1990},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1990-edac.pdf},
confidential = {n},
abstract = {This paper discusses the selection of available components during high-level synthesis. We stress the importance of describing the behaviour of available components in some language which is readable for the designer. This behaviour is internally represented by implications. This concept is the key for formal reasoning about the component's capabilities. Alternative functions and sequential and concurrent cooperation of components can be easily described.},
} This paper discusses the selection of available components during high-level synthesis. We stress the importance of describing the behaviour of available components in some language which is readable for the designer. This behaviour is internally represented by implications. This concept is the key for formal reasoning about the component's capabilities. Alternative functions and sequential and concurrent cooperation of components can be easily described.
|
| Lothar Nowak and Peter Marwedel. Verification of Hardware Descriptions by Retargetable Code Generation. In Proceedings of the 26th Design Automation Conference (DAC) 1989 [BibTeX][PDF][Abstract]@inproceedings { nowak:1989:dac,
author = {Nowak, Lothar and Marwedel, Peter},
title = {Verification of Hardware Descriptions by Retargetable Code Generation},
booktitle = {Proceedings of the 26th Design Automation Conference (DAC)},
year = {1989},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1989-dac.pdf},
confidential = {n},
abstract = {This paper proposes a new method for hardware verification. The basic idea is the application of a retargetable compiler as verification tool. A retargetable compiler is able to compile programs into the machine code of a specified hardware (target). If the program is the complete behavioural specification of the target, the compiler can be used to verify that a properly programmed structure implements the behaviour. Methods, algorithms and applications of an existing retargetable compiler are described.},
} This paper proposes a new method for hardware verification. The basic idea is the application of a retargetable compiler as verification tool. A retargetable compiler is able to compile programs into the machine code of a specified hardware (target). If the program is the complete behavioural specification of the target, the compiler can be used to verify that a properly programmed structure implements the behaviour. Methods, algorithms and applications of an existing retargetable compiler are described.
|
| M. Balakrishnan and Peter Marwedel. Integrated Scheduling and Binding: A Synthesis Approach for Design Space Exploration. In Proceedings of the 26th Design Automation Conference (DAC) 1989 [BibTeX][PDF][Abstract]@inproceedings { balakrishnan:1989:dac,
author = {Balakrishnan, M. and Marwedel, Peter},
title = {Integrated Scheduling and Binding: A Synthesis Approach for Design Space Exploration},
booktitle = {Proceedings of the 26th Design Automation Conference (DAC)},
year = {1989},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1989-dac.pdf},
confidential = {n},
abstract = {Synthesis of digital systems involves a number of tasks ranging from scheduling to generating interconnections. The interrelationship between these tasks implies that good designs can only be generated by considering the overall impact of a design decision. The approach presented in this paper provides a framework for integrating scheduling decisions with binding decisions. The methodology supports allocation of a wider mix of operator modules and covers the design space more effectively. The process itself can be described as incremental synthesis and is thus well-suited for applications involving partial pre-synthesized structures.},
} Synthesis of digital systems involves a number of tasks ranging from scheduling to generating interconnections. The interrelationship between these tasks implies that good designs can only be generated by considering the overall impact of a design decision. The approach presented in this paper provides a framework for integrating scheduling decisions with binding decisions. The methodology supports allocation of a wider mix of operator modules and covers the design space more effectively. The process itself can be described as incremental synthesis and is thus well-suited for applications involving partial pre-synthesized structures.
|
| Lothar Nowak and Peter Marwedel. A retargetable microcode-compiler and its application in design verification and architecture evaluation (in German: Ein retargierbarer Mikrocode-Compiler und seine Anwendung in Entwurfsverifikation und Architekturbewertung). In 18. GI Jahrestagung, pages 233-245 Hamburg, 1988, OCR errors are possible [BibTeX][PDF][Abstract]@inproceedings { nowak:1988:gi,
author = {Nowak, Lothar and Marwedel, Peter},
title = {A retargetable microcode-compiler and its application in design verification and architecture evaluation (in German: Ein retargierbarer Mikrocode-Compiler und seine Anwendung in Entwurfsverifikation und Architekturbewertung)},
booktitle = {18. GI Jahrestagung},
year = {1988},
pages = {233--245},
address = {Hamburg},
note = {OCR errors are possible},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1988-GI-Tagung.pdf},
confidential = {n},
abstract = {Die vorliegende Arbeit beschreibt einen retargierbaren Compiler. Ein solcher Compiler ist in der Lage f\"ur verschiedenen Zielmaschinen (Targets) Maschinencode zu erzeugen. Einsatzgebiete dieses Compilers sind vor allem: die Codeerzeugung f\"ur anwendungsspezifische Spezialprozessoren (speziell auch f\"ur Maschinen gro{\ss}er Befehlswortbreite), die Verifikation einer manuell erzeugten Hardwarestruktur gegen\"uber einer Verhaltensspezifikation und der Vergleich verschiedener Architekturkonzepte untereinander. Wesentliche Vorteile des Compilers ergeben sich aus der Tatsache, da{\ss} bereits dem Hardware-Designer eine Compiler-Unterst\"utzung zur Verf\"ugung steht. Schnittstellen-Probleme zwischen Compiler- und Hardware-Entwicklung entfallen, da f\"ur beide die gleiche Maschinenbeschreibung benutzt wird. So k\"onnen beim Architekturentwurf verschiedene Konzepte hinsichtlich Laufzeit und Codedichte l\"angerer Anwendungsprogramme verglichen werden.},
} Die vorliegende Arbeit beschreibt einen retargierbaren Compiler. Ein solcher Compiler ist in der Lage für verschiedenen Zielmaschinen (Targets) Maschinencode zu erzeugen. Einsatzgebiete dieses Compilers sind vor allem: die Codeerzeugung für anwendungsspezifische Spezialprozessoren (speziell auch für Maschinen großer Befehlswortbreite), die Verifikation einer manuell erzeugten Hardwarestruktur gegenüber einer Verhaltensspezifikation und der Vergleich verschiedener Architekturkonzepte untereinander. Wesentliche Vorteile des Compilers ergeben sich aus der Tatsache, daß bereits dem Hardware-Designer eine Compiler-Unterstützung zur Verfügung steht. Schnittstellen-Probleme zwischen Compiler- und Hardware-Entwicklung entfallen, da für beide die gleiche Maschinenbeschreibung benutzt wird. So können beim Architekturentwurf verschiedene Konzepte hinsichtlich Laufzeit und Codedichte längerer Anwendungsprogramme verglichen werden.
|
| Peter Marwedel. A New Synthesis Algorithm for the MIMOLA Software System. In Proceedings of the 23rd Design Automation Conference (DAC) June 1986 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1986:dac,
author = {Marwedel, Peter},
title = {A New Synthesis Algorithm for the {MIMOLA} Software System},
booktitle = {Proceedings of the 23rd Design Automation Conference (DAC)},
year = {1986},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1986-dac.pdf},
confidential = {n},
abstract = {The MIMOLA software system is a system for the design of digital processors. The system includes subsystems for retargetable microcode generation, automatic generation of self-test programs and a synthesis subsystem. This paper describes the synthesis part of the system, which accepts a PASCAL-like, high-level program as specification and produces a register transfer structure. Because of the complexity of this design process, a set of sub-problems is identified and algorithms for their solution are indicated. These algorithms include a flexible statement decomposition, statement scheduling, register assignment, module selection and optimizations of interconnections and instruction word length.},
} The MIMOLA software system is a system for the design of digital processors. The system includes subsystems for retargetable microcode generation, automatic generation of self-test programs and a synthesis subsystem. This paper describes the synthesis part of the system, which accepts a PASCAL-like, high-level program as specification and produces a register transfer structure. Because of the complexity of this design process, a set of sub-problems is identified and algorithms for their solution are indicated. These algorithms include a flexible statement decomposition, statement scheduling, register assignment, module selection and optimizations of interconnections and instruction word length.
|
| Peter Marwedel. The MIMOLA Design System: Tools for the Design of Digital Processors. In Proceedings of the 21st Design Automation Conference (DAC) 1984 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1984:dac,
author = {Marwedel, Peter},
title = {The {MIMOLA} Design System: Tools for the Design of Digital Processors},
booktitle = {Proceedings of the 21st Design Automation Conference (DAC)},
year = {1984},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1984-dac.pdf},
confidential = {n},
abstract = {The MIMOLA design method is a method for the design of digital processors from a very high-level behavioral specification. A key feature of this method is the synthesis of a processor from a description of programs which are expected to be typical for the applications of that processor. Design cycles, in which the designer tries to improve automatically generated hardware structures, are supported by a retargetable microcode generator and by a utilization and performance analyzer. This paper describes the design method, available software tools and some applications.},
} The MIMOLA design method is a method for the design of digital processors from a very high-level behavioral specification. A key feature of this method is the synthesis of a processor from a description of programs which are expected to be typical for the applications of that processor. Design cycles, in which the designer tries to improve automatically generated hardware structures, are supported by a retargetable microcode generator and by a utilization and performance analyzer. This paper describes the design method, available software tools and some applications.
|
| Peter Marwedel. The Integrated Design of Computer Systems with MIMOLA. In Proc. 12th annual Gi-conf., Informatik Fachberichte #57 June 1982 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1982:gi,
author = {Marwedel, Peter},
title = {The Integrated Design of Computer Systems with {MIMOLA}},
booktitle = {Proc. 12th annual {GI}-conf., Informatik Fachberichte \#57},
year = {1982},
month = {jun},
publisher = {Springer},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1982-gi-Vol57.pdf},
confidential = {n},
abstract = {A design method and the corresponding CAD tools for the design of digital computers from a very high-level functional specification is presented. Computers are specified by a set of application programs, for which they shall be optimized, and a set of boundary conditions for the type and number of hardware resources. The paper describes CAD tools, which synthesize architectures from the specification, evaluate their cost and performance and generate microcode. Several design iterations, in which the designer plays an active role, are used to optimize the design. Two applications of the design method are included in the paper.},
} A design method and the corresponding CAD tools for the design of digital computers from a very high-level functional specification is presented. Computers are specified by a set of application programs, for which they shall be optimized, and a set of boundary conditions for the type and number of hardware resources. The paper describes CAD tools, which synthesize architectures from the specification, evaluate their cost and performance and generate microcode. Several design iterations, in which the designer plays an active role, are used to optimize the design. Two applications of the design method are included in the paper.
|
| Peter Marwedel. The MIMOLA Design System: Detailed Description of the Software System. In Proceedings of the 16th Design Automation Conference (DAC) 1979 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1979:dac,
author = {Marwedel, Peter},
title = {The {MIMOLA} Design System: Detailed Description of the Software System},
booktitle = {Proceedings of the 16th Design Automation Conference (DAC)},
year = {1979},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1979-dac.pdf},
confidential = {n},
abstract = {A software system is described which aids in the design of digital processors with a top-down method. It takes the design language MIMOLA as its input. Input may be either a high level functional or a structural description of the hardware. The output is a description of the hardware on the block level. In the case of a high level functional input, the output contains a listing of the hardware which is necessary to execute the input together with utilization factors of the hardware units. The amount of creatable hardware may be limited in a declaration. If there is not enough hardware to execute the input statements in parallel, the necessary intermediate steps are inserted by the system and a corresponding functional description is presented.},
} A software system is described which aids in the design of digital processors with a top-down method. It takes the design language MIMOLA as its input. Input may be either a high level functional or a structural description of the hardware. The output is a description of the hardware on the block level. In the case of a high level functional input, the output contains a listing of the hardware which is necessary to execute the input together with utilization factors of the hardware units. The amount of creatable hardware may be limited in a declaration. If there is not enough hardware to execute the input statements in parallel, the necessary intermediate steps are inserted by the system and a corresponding functional description is presented.
|