| Helena Kotthaus, Lea Schönberger, Andreas Lang, Jian-Jia Chen and Peter Marwedel. Can Flexible Multi-Core Scheduling Help to Execute Machine Learning Algorithms Resource-Efficiently?. In 22nd International Workshop on Software and Compilers for Embedded Systems, pages 59--62 2019 [BibTeX][Link]@inproceedings { kotthaus/2019b,
author = {Kotthaus, Helena and Sch\"onberger, Lea and Lang, Andreas and Chen, Jian-Jia and Marwedel, Peter},
title = {Can Flexible Multi-Core Scheduling Help to Execute Machine Learning Algorithms Resource-Efficiently?},
booktitle = {22nd International Workshop on Software and Compilers for Embedded Systems},
year = {2019},
series = {SCOPES '19},
pages = {59--62},
publisher = {ACM},
url = {https://dl.acm.org/citation.cfm?id=3323986},
keywords = {Lea},
confidential = {n},
} |
| Constantin Timm, Jens Schmutzler, Peter Marwedel and Christian Wietfeld. Dynamic Web Service Orchestration applied to the Device Profile for Web Services in Hierarchical Networks. In COMSWARE '09: Proceedings of the Fourth International ICST Conference on COMmunication System softWAre and middlewaRE, pages 1 - 6 Dublin, Ireland, 06 2009 [BibTeX][Abstract]@inproceedings { 2009Timm,
author = {Timm, Constantin and Schmutzler, Jens and Marwedel, Peter and Wietfeld, Christian},
title = {Dynamic Web Service Orchestration applied to the Device Profile for Web Services in Hierarchical Networks},
booktitle = {COMSWARE '09: Proceedings of the Fourth International ICST Conference on COMmunication System softWAre and middlewaRE},
year = {2009},
pages = {1 - 6},
address = {Dublin, Ireland},
month = {06},
confidential = {n},
abstract = {Based on the idea of Service Oriented Architectures (SOA), Web Services paved the way for open and flexible interac- tion between heterogeneous systems with a loose coupling between service endpoints. The Device Profile for Web Ser- vices (DPWS) implements a subset of WS-* specifications in order to make the advantages of the Web Service archi- tecture available to a growing embedded systems market. In this paper we are proposing a service orchestration mecha- nism applied to services on top of a DPWS-based middle- ware. The approach is complementary to the rather complex and resource intensive Web Service Business Process Execu- tion Language (WS-BPEL) and focuses on service orchestra- tion on resource constrained devices deployed in hierarchi- cal network topologies. We validate our service orchestra- tion concept through its resource consumption and illustrate its seamless integration into the service development cycle based on the underlying DPWS-compliant middleware.},
} Based on the idea of Service Oriented Architectures (SOA), Web Services paved the way for open and flexible interac- tion between heterogeneous systems with a loose coupling between service endpoints. The Device Profile for Web Ser- vices (DPWS) implements a subset of WS-* specifications in order to make the advantages of the Web Service archi- tecture available to a growing embedded systems market. In this paper we are proposing a service orchestration mecha- nism applied to services on top of a DPWS-based middle- ware. The approach is complementary to the rather complex and resource intensive Web Service Business Process Execu- tion Language (WS-BPEL) and focuses on service orchestra- tion on resource constrained devices deployed in hierarchi- cal network topologies. We validate our service orchestra- tion concept through its resource consumption and illustrate its seamless integration into the service development cycle based on the underlying DPWS-compliant middleware.
|
| Sascha Plazar, Paul Lokuciejewski and Peter Marwedel. WCET-aware Software Based Cache Partitioning for Multi-Task Real-Time Systems. In The 9th International Workshop on Worst-Case Execution Time Analysis (WCET), pages 78-88 Dublin / Ireland, June 2009 [BibTeX][PDF][Abstract]@inproceedings { plazar:09:wcet,
author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter},
title = {WCET-aware Software Based Cache Partitioning for Multi-Task Real-Time Systems},
booktitle = {The 9th International Workshop on Worst-Case Execution Time Analysis (WCET)},
year = {2009},
pages = {78-88},
address = {Dublin / Ireland},
month = {jun},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-wcet.pdf},
confidential = {n},
abstract = {Caches are a source of unpredictability since it is very difficult to predict if a memory access results in a cache hit or miss. In systems running multiple tasks steered by a preempting scheduler, it is even impossible to determine the cache behavior since interrupt-driven schedulers lead to unknown points of time for context switches. Partitioned caches are already used in multi-task environments to increase the cache hit ratio by avoiding mutual eviction of tasks from the cache. For real-time systems, the upper bound of the execution time is one of the most important metrics, called the Worst-Case Execution Time (WCET). In this paper, we use partitioning of instruction caches as a technique to achieve tighter WCET estimations since tasks can not be evicted from their partition by other tasks. We propose a novel WCET-aware algorithm, which determines the optimal partition size for each task with focus on decreasing the system's WCET for a given set of possible partition sizes. Employing this algorithm, we are able to decrease the WCET depending on the number of tasks in a set by up to 34\%. On average, reductions between 12\% and 19\% can be achieved.},
} Caches are a source of unpredictability since it is very difficult to predict if a memory access results in a cache hit or miss. In systems running multiple tasks steered by a preempting scheduler, it is even impossible to determine the cache behavior since interrupt-driven schedulers lead to unknown points of time for context switches. Partitioned caches are already used in multi-task environments to increase the cache hit ratio by avoiding mutual eviction of tasks from the cache. For real-time systems, the upper bound of the execution time is one of the most important metrics, called the Worst-Case Execution Time (WCET). In this paper, we use partitioning of instruction caches as a technique to achieve tighter WCET estimations since tasks can not be evicted from their partition by other tasks. We propose a novel WCET-aware algorithm, which determines the optimal partition size for each task with focus on decreasing the system's WCET for a given set of possible partition sizes. Employing this algorithm, we are able to decrease the WCET depending on the number of tasks in a set by up to 34%. On average, reductions between 12% and 19% can be achieved.
|
| G. Schuenemann, P. Hartmann, D. Schirmer, P. Towalski, T. Weis, K. Wille and P. Marwedel. An FPGA Based Data Acquisition System for a fast Orbit Feedback at DELTA. In 9th European Workshop on Beam Diagnostics and Instrumentation for Particle Accelerators Basel / Switzerland, May 2009 [BibTeX][PDF][Abstract]@inproceedings { marwedel:09:dipac,
author = {Schuenemann, G. and Hartmann, P. and Schirmer, D. and Towalski, P. and Weis, T. and Wille, K. and Marwedel, P.},
title = {An FPGA Based Data Acquisition System for a fast Orbit Feedback at DELTA},
booktitle = {9th European Workshop on Beam Diagnostics and Instrumentation for Particle Accelerators},
year = {2009},
address = {Basel / Switzerland},
month = {may},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-dipac.pdf},
confidential = {n},
abstract = {The demand for beam orbit stability for frequencies up to 1kHz resulted in the need for a fast orbit position data acquisition system at DELTA. The measurement frequency was decided to be 10kHz which results in a good margin for 1kHz corrections. It is based on a Xilinx University Program Virtex-II Pro Development System in conjunction with an inhouse developed Analog-Digital Converter board, featuring two Analog Devices AD974 chips. An inhouse developed software written in VHDL manages measurement and data pre-processing. A communication controller has been adopted from the Diamond Light Source and is used as communication instance. The communication controller is versatile in its application. The data distribution between two or more of the developed measuring systems is possible. This includes data distribution with other systems utilizing the communication controller, e.g. the Libera beam diagnostic system1. To enhance its measuring capabilities one of the two onboard PowerPC cores is running a Linux kernel. A kernel module, capable of receiving the measurement data from the Field Programmable Gateway Array (FPGA) measurement core, was implemented , allowing for advanced data processing and distribution options. The paper presents the design of the system, the used methods and successful results of the first beam measurements.},
} The demand for beam orbit stability for frequencies up to 1kHz resulted in the need for a fast orbit position data acquisition system at DELTA. The measurement frequency was decided to be 10kHz which results in a good margin for 1kHz corrections. It is based on a Xilinx University Program Virtex-II Pro Development System in conjunction with an inhouse developed Analog-Digital Converter board, featuring two Analog Devices AD974 chips. An inhouse developed software written in VHDL manages measurement and data pre-processing. A communication controller has been adopted from the Diamond Light Source and is used as communication instance. The communication controller is versatile in its application. The data distribution between two or more of the developed measuring systems is possible. This includes data distribution with other systems utilizing the communication controller, e.g. the Libera beam diagnostic system1. To enhance its measuring capabilities one of the two onboard PowerPC cores is running a Linux kernel. A kernel module, capable of receiving the measurement data from the Field Programmable Gateway Array (FPGA) measurement core, was implemented , allowing for advanced data processing and distribution options. The paper presents the design of the system, the used methods and successful results of the first beam measurements.
|
| Paul Lokuciejewski, Daniel Cordes, Heiko Falk and Peter Marwedel. A Fast and Precise Static Loop Analysis based on Abstract Interpretation, Program Slicing and Polytope Models. In International Symposium on Code Generation and Optimization (CGO), pages 136-146 Seattle / USA, March 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:cgo,
author = {Lokuciejewski, Paul and Cordes, Daniel and Falk, Heiko and Marwedel, Peter},
title = {A Fast and Precise Static Loop Analysis based on Abstract Interpretation, Program Slicing and Polytope Models},
booktitle = {International Symposium on Code Generation and Optimization (CGO)},
year = {2009},
pages = {136-146},
address = {Seattle / USA},
month = {mar},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-cgo.pdf},
confidential = {n},
abstract = {A static loop analysis is a program analysis computing loop iteration counts. This information is crucial for different fields of applications. In the domain of compilers, the knowledge about loop iterations can be exploited for aggressive loop optimizations like Loop Unrolling. A loop analyzer also provides static information about code execution frequencies which can assist feedback-directed optimizations. Another prominent application is the static worst-case execution time (WCET) analysis which relies on a safe approximation of loop iteration counts. In this paper, we propose a framework for a static loop analysis based on Abstract Interpretation, a theory of a sound approximation of program semantics. To accelerate the analysis, we preprocess the analyzed code using Program Slicing, a technique that removes statements irrelevant for the loop analysis. In addition, we introduce a novel polytope-based loop evaluation that further significantly reduces the analysis time. The efficiency of our loop analyzer is evaluated on a large number of benchmarks. Results show that 99\% of the considered loops could be successfully analyzed in an acceptable amount of time. This study points out that our methodology is best suited for real-world problems.},
} A static loop analysis is a program analysis computing loop iteration counts. This information is crucial for different fields of applications. In the domain of compilers, the knowledge about loop iterations can be exploited for aggressive loop optimizations like Loop Unrolling. A loop analyzer also provides static information about code execution frequencies which can assist feedback-directed optimizations. Another prominent application is the static worst-case execution time (WCET) analysis which relies on a safe approximation of loop iteration counts. In this paper, we propose a framework for a static loop analysis based on Abstract Interpretation, a theory of a sound approximation of program semantics. To accelerate the analysis, we preprocess the analyzed code using Program Slicing, a technique that removes statements irrelevant for the loop analysis. In addition, we introduce a novel polytope-based loop evaluation that further significantly reduces the analysis time. The efficiency of our loop analyzer is evaluated on a large number of benchmarks. Results show that 99% of the considered loops could be successfully analyzed in an acceptable amount of time. This study points out that our methodology is best suited for real-world problems.
|
| Paul Lokuciejewski and Peter Marwedel. Combining Worst-Case Timing Models, Loop Unrolling, and Static Loop Analysis for WCET Minimization. In The 21st Euromicro Conference on Real-Time Systems (ECRTS), pages 35-44 Dublin / Ireland, July 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:ecrts,
author = {Lokuciejewski, Paul and Marwedel, Peter},
title = {Combining Worst-Case Timing Models, Loop Unrolling, and Static Loop Analysis for WCET Minimization},
booktitle = {The 21st Euromicro Conference on Real-Time Systems (ECRTS)},
year = {2009},
pages = {35-44},
address = {Dublin / Ireland},
month = {jul},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-ecrts.pdf},
confidential = {n},
abstract = {Program loops are notorious for their optimization potential on modern high-performance architectures. Compilers aim at their aggressive transformation to achieve large improvements of the program performance. In particular, the optimization loop unrolling has shown in the past decades to be highly effective achieving significant increases of the average-case performance. In this paper, we present loop unrolling that is tailored towards real-time systems. Our novel optimization is driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. To exploit maximal optimization potential, the determination of a suitable unrolling factor is based on precise loop iteration counts provided by a static loop analysis. In addition, our heuristics avoid adverse effects of unrolling which result from instruction cache overflows and the generation of additional spill code. Results on 45 real-life benchmarks demonstrate that aggressive loop unrolling can yield WCET reductions of up to 13.7\% over simple, naive approaches employed by many production compilers.},
} Program loops are notorious for their optimization potential on modern high-performance architectures. Compilers aim at their aggressive transformation to achieve large improvements of the program performance. In particular, the optimization loop unrolling has shown in the past decades to be highly effective achieving significant increases of the average-case performance. In this paper, we present loop unrolling that is tailored towards real-time systems. Our novel optimization is driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. To exploit maximal optimization potential, the determination of a suitable unrolling factor is based on precise loop iteration counts provided by a static loop analysis. In addition, our heuristics avoid adverse effects of unrolling which result from instruction cache overflows and the generation of additional spill code. Results on 45 real-life benchmarks demonstrate that aggressive loop unrolling can yield WCET reductions of up to 13.7% over simple, naive approaches employed by many production compilers.
|
| Paul Lokuciejewski, Fatih Gedikli, Peter Marwedel and Katharina Morik. Automatic WCET Reduction by Machine Learning Based Heuristics for Function Inlining. In Proceedings of the 3rd Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART), pages 1-15 Paphos / Cyprus, January 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:smart,
author = {Lokuciejewski, Paul and Gedikli, Fatih and Marwedel, Peter and Morik, Katharina},
title = {Automatic WCET Reduction by Machine Learning Based Heuristics for Function Inlining},
booktitle = {Proceedings of the 3rd Workshop on Statistical and Machine Learning Approaches to Architectures and Compilation (SMART)},
year = {2009},
pages = {1-15},
address = {Paphos / Cyprus},
month = {jan},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-smart.pdf},
confidential = {n},
abstract = {The application of machine learning techniques in compiler frameworks has become a challenging research area. Learning algorithms are exploited for an automatic generation of optimization heuristics which often outperform hand-crafted models. Moreover, these automatic approaches can effectively tune the compilers' heuristics after larger changes in the optimization sequence or they can be leveraged to tailor heuristics towards a particular architectural model. Previous works focussed on a reduction of the average-case performance. In this paper, learning approaches are studied in the context of an automatic minimization of the worst-case execution time (WCET) which is the upper bound of the program's maximum execution time. We show that explicitly taking the new timing model into account allows the construction of compiler heuristics that effectively reduce the WCET. This is demonstrated for the well-known optimization function inlining. Our WCET-driven inlining heuristics based on a fast classifier called random forests outperform standard heuristics by up to 9.1% on average in terms of the WCET reduction. Moreover, we point out that our classifier is highly accurate with a prediction rate for inlining candidates of 84.0%.},
} The application of machine learning techniques in compiler frameworks has become a challenging research area. Learning algorithms are exploited for an automatic generation of optimization heuristics which often outperform hand-crafted models. Moreover, these automatic approaches can effectively tune the compilers' heuristics after larger changes in the optimization sequence or they can be leveraged to tailor heuristics towards a particular architectural model. Previous works focussed on a reduction of the average-case performance. In this paper, learning approaches are studied in the context of an automatic minimization of the worst-case execution time (WCET) which is the upper bound of the program's maximum execution time. We show that explicitly taking the new timing model into account allows the construction of compiler heuristics that effectively reduce the WCET. This is demonstrated for the well-known optimization function inlining. Our WCET-driven inlining heuristics based on a fast classifier called random forests outperform standard heuristics by up to 9.1% on average in terms of the WCET reduction. Moreover, we point out that our classifier is highly accurate with a prediction rate for inlining candidates of 84.0%.
|
| Paul Lokuciejewski, Fatih Gedikli and Peter Marwedel. Accelerating WCET-driven Optimizations by the Invariant Path Paradigm - a Case Study of Loop Unswitching. In The 12th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 11-20 Nice / France, April 2009 [BibTeX][PDF][Abstract]@inproceedings { lokuciejewski:09:scopes,
author = {Lokuciejewski, Paul and Gedikli, Fatih and Marwedel, Peter},
title = {Accelerating WCET-driven Optimizations by the Invariant Path Paradigm - a Case Study of Loop Unswitching},
booktitle = {The 12th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
year = {2009},
pages = {11-20},
address = {Nice / France},
month = {apr},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2009-scopes.pdf},
confidential = {n},
abstract = {The worst-case execution time (WCET) being the upper bound of the maximum execution time corresponds to the longest path through the program's control flow graph. Its reduction is the objective of a WCET optimization. Unlike average-case execution time compiler optimizations which consider a static (most frequently executed) path, the longest path is variable since its optimization might result in another path becoming the effective longest path. To keep path information valid, WCET optimizations typically perform a time-consuming static WCET analysis after each code modification to ensure that subsequent optimization steps operate on the critical path. However, a code modification does not always lead to a path switch, making many WCET analyses superfluous. To cope with this problem, we propose a new paradigm called Invariant Path which eliminates the pessimism by indicating whether a path update is mandatory. To demonstrate the paradigm's practical use, we developed a novel optimization called WCET-driven Loop Unswitching which exploits the Invariant Path information. In a case study, our optimization reduced the WCET of real-world benchmarks by up to 18.3\%, while exploiting the Invariant Path paradigm led to a reduction of the optimization time by 57.5\% on average.},
} The worst-case execution time (WCET) being the upper bound of the maximum execution time corresponds to the longest path through the program's control flow graph. Its reduction is the objective of a WCET optimization. Unlike average-case execution time compiler optimizations which consider a static (most frequently executed) path, the longest path is variable since its optimization might result in another path becoming the effective longest path. To keep path information valid, WCET optimizations typically perform a time-consuming static WCET analysis after each code modification to ensure that subsequent optimization steps operate on the critical path. However, a code modification does not always lead to a path switch, making many WCET analyses superfluous. To cope with this problem, we propose a new paradigm called Invariant Path which eliminates the pessimism by indicating whether a path update is mandatory. To demonstrate the paradigm's practical use, we developed a novel optimization called WCET-driven Loop Unswitching which exploits the Invariant Path information. In a case study, our optimization reduced the WCET of real-world benchmarks by up to 18.3%, while exploiting the Invariant Path paradigm led to a reduction of the optimization time by 57.5% on average.
|
| Paul Lokuciejewski, Heiko Falk and Peter Marwedel. WCET-driven Cache-based Procedure Positioning Optimizations. In The 20th Euromicro Conference on Real-Time Systems (ECRTS), pages 321-330 Prague / Czech Republic, July 2008 [BibTeX][PDF][Abstract]@inproceedings { loku:08:ecrts,
author = {Lokuciejewski, Paul and Falk, Heiko and Marwedel, Peter},
title = {WCET-driven Cache-based Procedure Positioning Optimizations},
booktitle = {The 20th Euromicro Conference on Real-Time Systems (ECRTS)},
year = {2008},
pages = {321-330},
address = {Prague / Czech Republic},
month = {jul},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-ecrts.pdf},
confidential = {n},
abstract = {Procedure Positioning is a well known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of procedures calling each other frequently in the memory avoids overlapping of cache lines and thus decreases the number of cache conflict misses. In standard literature, these positioning techniques are guided by execution profile data and focus on an improved average-case performance. We present two novel positioning optimizations driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. WCET reductions by 10\% on average are achieved. Moreover, a combination of positioning and the WCET-driven Procedure Cloning optimization is presented improving the WCET analysis by 36\% on average.},
} Procedure Positioning is a well known compiler optimization aiming at the improvement of the instruction cache behavior. A contiguous mapping of procedures calling each other frequently in the memory avoids overlapping of cache lines and thus decreases the number of cache conflict misses. In standard literature, these positioning techniques are guided by execution profile data and focus on an improved average-case performance. We present two novel positioning optimizations driven by worst-case execution time (WCET) information to effectively minimize the program's worst-case behavior. WCET reductions by 10% on average are achieved. Moreover, a combination of positioning and the WCET-driven Procedure Cloning optimization is presented improving the WCET analysis by 36% on average.
|
| Paul Lokuciejewski, Heiko Falk, Peter Marwedel and Henrik Theiling. WCET-Driven, Code-Size Critical Procedure Cloning. In The 11th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 21-30 Munich / Germany, March 2008 [BibTeX][PDF][Abstract]@inproceedings { loku:08:scopes,
author = {Lokuciejewski, Paul and Falk, Heiko and Marwedel, Peter and Theiling, Henrik},
title = {WCET-Driven, Code-Size Critical Procedure Cloning},
booktitle = {The 11th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
year = {2008},
pages = {21-30},
address = {Munich / Germany},
month = {mar},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-scopes.pdf},
confidential = {n},
abstract = {In the domain of the worst-case execution time (WCET) analysis, loops are an inherent source of unpredictability and loss of precision since the determination of tight and safe information on the number of loop iterations is a diffi- cult task. In particular, data-dependent loops whose itera- tion counts depend on function parameters can not be pre- cisely handled by a timing analysis. Procedure Cloning can be exploited to make these loops explicit within the source code allowing a highly precise WCET analysis. In this paper we extend the standard Procedure Cloning optimization by WCET-aware concepts with the objective to improve the tightness of the WCET estimation. Our novel approach is driven by WCET information which succes- sively eliminates code structures leading to overestimated timing results, thus making the code more suitable for the analysis. In addition, the code size increase during the op- timization is monitored and large increases are avoided. The effectiveness of our optimization is shown by tests on real-world benchmarks. After performing our optimiza- tion, the estimated WCET is reduced by up to 64.2\% while the employed code transformations yield an additional code size increase of 22.6\% on average. In contrast, the average- case performance being the original objective of Procedure Cloning showed a slight decrease.},
} In the domain of the worst-case execution time (WCET) analysis, loops are an inherent source of unpredictability and loss of precision since the determination of tight and safe information on the number of loop iterations is a diffi- cult task. In particular, data-dependent loops whose itera- tion counts depend on function parameters can not be pre- cisely handled by a timing analysis. Procedure Cloning can be exploited to make these loops explicit within the source code allowing a highly precise WCET analysis. In this paper we extend the standard Procedure Cloning optimization by WCET-aware concepts with the objective to improve the tightness of the WCET estimation. Our novel approach is driven by WCET information which succes- sively eliminates code structures leading to overestimated timing results, thus making the code more suitable for the analysis. In addition, the code size increase during the op- timization is monitored and large increases are avoided. The effectiveness of our optimization is shown by tests on real-world benchmarks. After performing our optimiza- tion, the estimated WCET is reduced by up to 64.2% while the employed code transformations yield an additional code size increase of 22.6% on average. In contrast, the average- case performance being the original objective of Procedure Cloning showed a slight decrease.
|
| Sascha Plazar, Paul Lokuciejewski and Peter Marwedel. A Retargetable Framework for Multi-objective WCET-aware High-level Compiler Optimizations. In Proceedings of The 29th IEEE Real-Time Systems Symposium (RTSS) WiP, pages 49-52 Barcelona / Spain, December 2008 [BibTeX][PDF][Abstract]@inproceedings { plazar:08:rtss,
author = {Plazar, Sascha and Lokuciejewski, Paul and Marwedel, Peter},
title = {A Retargetable Framework for Multi-objective WCET-aware High-level Compiler Optimizations},
booktitle = {Proceedings of The 29th IEEE Real-Time Systems Symposium (RTSS) WiP},
year = {2008},
pages = {49-52},
address = {Barcelona / Spain},
month = {dec},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-rtss.pdf},
confidential = {n},
abstract = {The worst-case execution time (WCET) is a key parameter in the domain of real-time systems and its automatic compiler-based minimization becomes a challenging research area. Although today's embedded system applications are written in a high-level language, most published works consider low-level optimizations which complicate their portability to other processors. In this work, we present a framework for the development of novel WCETdriven high-level optimizations. Our WCET-aware compiler framework provides a multi-target support as well as an integration of different non-functional objectives. It enables multi-objective optimizations, thus opens avenues to a state-of-the-art design of predictable and efficient systems. In addition, the multi-target support provides the opportunity to efficiently evaluate the impact of different compiler optimizations on various processors.},
} The worst-case execution time (WCET) is a key parameter in the domain of real-time systems and its automatic compiler-based minimization becomes a challenging research area. Although today's embedded system applications are written in a high-level language, most published works consider low-level optimizations which complicate their portability to other processors. In this work, we present a framework for the development of novel WCETdriven high-level optimizations. Our WCET-aware compiler framework provides a multi-target support as well as an integration of different non-functional objectives. It enables multi-objective optimizations, thus opens avenues to a state-of-the-art design of predictable and efficient systems. In addition, the multi-target support provides the opportunity to efficiently evaluate the impact of different compiler optimizations on various processors.
|
| Peter Marwedel and Heiko Falk (presentation). Memory-architecture aware compilation. In The ARTIST2 Summer School 2008 in Europe Autrans / France, 2008 [BibTeX][PDF]@inproceedings { marwedel:08:artist2,
author = {Marwedel, Peter and Falk (presentation), Heiko},
title = {Memory-architecture aware compilation},
booktitle = {The ARTIST2 Summer School 2008 in Europe},
year = {2008},
address = {Autrans / France},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2008-artist2summerschool.pdf},
confidential = {n},
} |
| Paul Lokuciejewski, Heiko Falk, Martin Schwarzer and Peter Marwedel. Tighter WCET Estimates by Procedure Cloning. In 7th International Workshop on Worst-Case Execution Time Analysis (WCET), pages 27-32 Pisa/Italy, July 2007 [BibTeX][PDF][Abstract]@inproceedings { loku:07:wcet,
author = {Lokuciejewski, Paul and Falk, Heiko and Schwarzer, Martin and Marwedel, Peter},
title = {Tighter WCET Estimates by Procedure Cloning},
booktitle = {7th International Workshop on Worst-Case Execution Time Analysis (WCET)},
year = {2007},
pages = {27-32},
address = {Pisa/Italy},
month = {jul},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-wcet.pdf},
confidential = {n},
abstract = {Embedded software spends most of its execution time in loops. To allow a precise static WCET analysis, each loop iteration should, in theory, be represented by an individual calling context. However, due to the enormous analysis times of real-world applications, this approach is not feasible and requires a reduction of the analysis complexity by limiting the number of considered contexts. This restricted timing analysis results in imprecise WCET estimates. In particular, data-dependent loops with iteration counts depending on function parameters cannot be precisely analyzed. In order to reduce the number of contexts that must be implicitly considered, causing an increase in analysis time, we apply the standard compiler optimization \textem{procedure cloning} which improves the program's predictability by making loops explicit and thus allowing a precise annotation of loop bounds. The result is a tight WCET estimation within a reduced analysis time. Our results indicate that reductions of the WCET between 12\% and 95\% were achieved for real-world benchmarks. In contrast, the reduction of the simulated program execution time remained marginal with only 3\%. As will be also shown, this optimization only produces a small overhead for the WCET analysis.},
} Embedded software spends most of its execution time in loops. To allow a precise static WCET analysis, each loop iteration should, in theory, be represented by an individual calling context. However, due to the enormous analysis times of real-world applications, this approach is not feasible and requires a reduction of the analysis complexity by limiting the number of considered contexts. This restricted timing analysis results in imprecise WCET estimates. In particular, data-dependent loops with iteration counts depending on function parameters cannot be precisely analyzed. In order to reduce the number of contexts that must be implicitly considered, causing an increase in analysis time, we apply the standard compiler optimization procedure cloning which improves the program's predictability by making loops explicit and thus allowing a precise annotation of loop bounds. The result is a tight WCET estimation within a reduced analysis time. Our results indicate that reductions of the WCET between 12% and 95% were achieved for real-world benchmarks. In contrast, the reduction of the simulated program execution time remained marginal with only 3%. As will be also shown, this optimization only produces a small overhead for the WCET analysis.
|
| Robert Pyka, Christoph Faßbach, Manish Verma, Heiko Falk and Peter Marwedel. Operating system integrated energy aware scratchpad allocation strategies for multiprocess applications. In 10th International Workshop on Software & Compilers for Embedded Systems (SCOPES), pages 41-50 Nice/France, April 2007 [BibTeX][PDF][Abstract]@inproceedings { pyka:07:scopes,
author = {Pyka, Robert and Fa\"sbach, Christoph and Verma, Manish and Falk, Heiko and Marwedel, Peter},
title = {Operating system integrated energy aware scratchpad allocation strategies for multiprocess applications},
booktitle = {10th International Workshop on Software \& Compilers for Embedded Systems (SCOPES)},
year = {2007},
pages = {41-50},
address = {Nice/France},
month = {apr},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-scopes.pdf},
confidential = {n},
abstract = {Various scratchpad allocation strategies have been developed in the past. Most of them target the reduction of energy consumption. These approaches share the necessity of having direct access to the scratchpad memory. In earlier embedded systems this was always true, but with the increasing complexity of tasks systems have to perform, an additional operating system layer between the hardware and the application is becoming mandatory. This paper presents an approach to integrate a scratchpad memory manager into the operating system. The goal is to minimize energy consumption. In contrast to previous work, compile time knowledge about the application's behavior is taken into account. A set of fast heuristic allocation methods is proposed in this paper. An in-depth study and comparison of achieved energy savings and cycle reductions was performed. The results show that even in the highly dynamic environment of an operating system equipped embedded system, up to 83% energy consumption reduction can be achieved.},
} Various scratchpad allocation strategies have been developed in the past. Most of them target the reduction of energy consumption. These approaches share the necessity of having direct access to the scratchpad memory. In earlier embedded systems this was always true, but with the increasing complexity of tasks systems have to perform, an additional operating system layer between the hardware and the application is becoming mandatory. This paper presents an approach to integrate a scratchpad memory manager into the operating system. The goal is to minimize energy consumption. In contrast to previous work, compile time knowledge about the application's behavior is taken into account. A set of fast heuristic allocation methods is proposed in this paper. An in-depth study and comparison of achieved energy savings and cycle reductions was performed. The results show that even in the highly dynamic environment of an operating system equipped embedded system, up to 83% energy consumption reduction can be achieved.
|
| Paul Lokuciejewski, Heiko Falk, Martin Schwarzer, Peter Marwedel and Henrik Theiling. Influence of Procedure Cloning on WCET Prediction. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), pages 137-142 Salzburg/Austria, September 2007 [BibTeX][PDF][Abstract]@inproceedings { loku:07:codes_isss,
author = {Lokuciejewski, Paul and Falk, Heiko and Schwarzer, Martin and Marwedel, Peter and Theiling, Henrik},
title = {Influence of Procedure Cloning on WCET Prediction},
booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
year = {2007},
pages = {137-142},
address = {Salzburg/Austria},
month = {sep},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2007-codes+isss_2.pdf},
confidential = {n},
abstract = {For the worst-case execution time \textem{(WCET)} analysis, especially loops are an inherent source of unpredictability and loss of precision. This is caused by the difficulty to obtain safe and tight information on the number of iterations executed by a loop in the worst case. In particular, data-dependent loops whose iteration counts depend on function parameters are extremely difficult to analyze precisely. Procedure cloning helps by making such data-dependent loops explicit within the source code, thus making them accessible for high-precision WCET analyses. This paper presents the effect of procedure cloning applied at the source-code level on worst-case execution time. The optimization generates specialized versions of functions being called with constant values as arguments. In standard literature, it is used to enable further optimizations like constant propagation within functions and to reduce calling overhead. We show that procedure cloning for WCET minimization leads to significant improvements. Reductions of the WCET from 12\% up to 95\% were measured for real-life benchmarks. These results demonstrate that procedure cloning improves analyzability and predictability of real-time applications dramatically. In contrast, average-case performance as the criterion procedure cloning was developed for is reduced by only 3\% at most. Our results also show that these WCET reductions only implied small overhead during WCET analysis.},
} For the worst-case execution time (WCET) analysis, especially loops are an inherent source of unpredictability and loss of precision. This is caused by the difficulty to obtain safe and tight information on the number of iterations executed by a loop in the worst case. In particular, data-dependent loops whose iteration counts depend on function parameters are extremely difficult to analyze precisely. Procedure cloning helps by making such data-dependent loops explicit within the source code, thus making them accessible for high-precision WCET analyses. This paper presents the effect of procedure cloning applied at the source-code level on worst-case execution time. The optimization generates specialized versions of functions being called with constant values as arguments. In standard literature, it is used to enable further optimizations like constant propagation within functions and to reduce calling overhead. We show that procedure cloning for WCET minimization leads to significant improvements. Reductions of the WCET from 12% up to 95% were measured for real-life benchmarks. These results demonstrate that procedure cloning improves analyzability and predictability of real-time applications dramatically. In contrast, average-case performance as the criterion procedure cloning was developed for is reduced by only 3% at most. Our results also show that these WCET reductions only implied small overhead during WCET analysis.
|
| Peter Marwedel, Heiko Falk, Sascha Plazar, Robert Pyka and Lars Wehmeyer. Automatic mapping to tightly-coupled memories and cache locking. In Proceedings of 4th HiPEAC Industrial Workshop on Compilers and Architectures Cambridge, UK, August 2007 [BibTeX][PDF][Link]@inproceedings { marwedel:07:hipeac,
author = {Marwedel, Peter and Falk, Heiko and Plazar, Sascha and Pyka, Robert and Lars Wehmeyer,},
title = {Automatic mapping to tightly-coupled memories and cache locking},
booktitle = {Proceedings of 4th HiPEAC Industrial Workshop on Compilers and Architectures},
year = {2007},
address = {Cambridge, UK},
month = {aug},
url = {http://www.hipeac.net/industry_workshop4},
keywords = {wcet},
file = {http://www.hipeac.net/system/files?file=session1_3.ppt},
confidential = {n},
} |
| Manish Verma and Peter Marwedel. Compilation and Simulation Tool Chain for Memory Aware Energy Optimizations. In Workshop on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS VI) Samos, Greece, July 2006 [BibTeX][PDF][Abstract]@inproceedings { verma:06:samos,
author = {Verma, Manish and Marwedel, Peter},
title = {Compilation and Simulation Tool Chain for Memory Aware Energy Optimizations},
booktitle = {Workshop on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS VI)},
year = {2006},
address = {Samos, Greece},
month = {jul},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2006-samos.pdf},
confidential = {n},
abstract = {Memory hierarchies are known to be the energy bottleneck of portable embedded devices. Numerous memory aware energy optimizations have been proposed. However, both the optimization and the validation is performed in an ad-hoc manner as a coherent compilation and simulation framework does not exist as yet. In this paper, we present such a framework for performing memory hierarchy aware energy optimization. Both the compiler and the simulator are configured from a single memory hierarchy description. Significant savings of up to 50\% in the total energy dissipation are reported.},
} Memory hierarchies are known to be the energy bottleneck of portable embedded devices. Numerous memory aware energy optimizations have been proposed. However, both the optimization and the validation is performed in an ad-hoc manner as a coherent compilation and simulation framework does not exist as yet. In this paper, we present such a framework for performing memory hierarchy aware energy optimization. Both the compiler and the simulator are configured from a single memory hierarchy description. Significant savings of up to 50% in the total energy dissipation are reported.
|
| Lars Wehmeyer and Peter Marwedel. Influence of Memory Hierarchies on Predictability for Time Constrained Embedded Software. In Design Automation and Test in Europe (DATE) Munich, Germany, March 2005 [BibTeX][PDF][Abstract]@inproceedings { wehm:05:date,
author = {Wehmeyer, Lars and Marwedel, Peter},
title = {Influence of Memory Hierarchies on Predictability for Time Constrained Embedded Software},
booktitle = {Design Automation and Test in Europe (DATE)},
year = {2005},
address = {Munich, Germany},
month = {mar},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-date.pdf},
confidential = {n},
abstract = {Safety-critical embedded systems having to meet real-time constraints are expected to be highly predictable in order to guarantee at design time that certain timing deadlines will always be met. This requirement usually prevents designers from utilizing caches due to their highly dynamic, thus hardly predictable behavior. The integration of scratchpad memories represents an alternative approach which allows the system to benefit from a performance gain comparable to that of caches while at the same time maintaining predictability. In this work, we compare the impact of scratchpad memories and caches on worst case execution time (WCET) analysis results. We show that caches, despite requiring complex techniques, can have a negative impact on the predicted WCET, while the estimated WCET for scratchpad memories scales with the achieved performance gain at no extra analysis cost.},
} Safety-critical embedded systems having to meet real-time constraints are expected to be highly predictable in order to guarantee at design time that certain timing deadlines will always be met. This requirement usually prevents designers from utilizing caches due to their highly dynamic, thus hardly predictable behavior. The integration of scratchpad memories represents an alternative approach which allows the system to benefit from a performance gain comparable to that of caches while at the same time maintaining predictability. In this work, we compare the impact of scratchpad memories and caches on worst case execution time (WCET) analysis results. We show that caches, despite requiring complex techniques, can have a negative impact on the predicted WCET, while the estimated WCET for scratchpad memories scales with the achieved performance gain at no extra analysis cost.
|
| Peter Marwedel, Manish Verma and Lars Wehmeyer. Compiler optimizations improving the processor/memory interface. In Workshop on Optimizing Compiler Assisted SoC Assembly (OCASA) September 2005 [BibTeX]@inproceedings { marw:05:ocasa,
author = {Marwedel, Peter and Verma, Manish and Wehmeyer, Lars},
title = {Compiler optimizations improving the processor/memory interface},
booktitle = {Workshop on Optimizing Compiler Assisted SoC Assembly (OCASA)},
year = {2005},
month = {sep},
confidential = {n},
} |
| Manish Verma and Peter Marwedel. Memory Optimization Techniques for Low-Power Embedded Processors. In IFIP VIVA Workshop - Fundamentals and Methods for Low-Power Information Processing Bonn, Germany, September 2005 [BibTeX][PDF][Abstract]@inproceedings { verma:05:viva,
author = {Verma, Manish and Marwedel, Peter},
title = {Memory Optimization Techniques for Low-Power Embedded Processors},
booktitle = {IFIP VIVA Workshop - Fundamentals and Methods for Low-Power Information Processing},
year = {2005},
address = {Bonn, Germany},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-viva.pdf},
confidential = {n},
abstract = {Power consumption is an important design issue for contemporary portable embedded devices. It is known that the next generation of portable devices will feature faster processors and larger memories, both of which require high operational power. %This is expected to Memory subsystem has already been identified as the energy bottleneck of the entire system. Consequently, memory hierarchies are being constructed to reduce the memory subsystem's energy dissipation. Caches and scratchpad memories represent two contrasting memory architectures. Scratchpads are both area and power efficient than caches. However, they require explicit support from the compiler for managing their contents. In this work, we present three approaches for the prudent utilization of the scratchpad memory of an ARM7 processor and of a M5 DSP based system. The first approach is based on the following observations. Firstly, a small memory requires less energy per access than that by a large memory. Secondly, applications in general consist of small and frequently accessed arrays and large but infrequently accessed arrays. Consequently, the approach partitions the large scratchpad into several small scratchpads. The arrays are also statically mapped such that the small arrays are mapped to small and energy efficient scratchpads. The approach leads to average energy savings of 52\% and 35\% in the data memory subsystem of the ARM7 and the M5 DSP, respectively. The second approach utilizes the scratchpad as an instruction buffer in a cache based memory hierarchy. The approach models the cache as a conflict graph and assigns instructions to the scratchpad. The objective is to minimize the energy consumption of the system while preserving the predictable behavior of the memory hierarchy. The approach results in an average energy saving of 21\% against the above approach for the ARM7 based system. The last approach optimizes the energy consumption of the system by overlaying memory objects (\textem{i.e.} code segments and data elements) on to the scratchpad. Memory objects with non-conflicting life-times are assigned to the same location on the scratchpad. This improves the scratchpad utilization, however, it requires copying memory objects on and off the scratchpad during the execution of the application. Average energy reductions of 34\% and 33\% are reported for the ARM7 and the M5 DSP based systems, respectively.},
} Power consumption is an important design issue for contemporary portable embedded devices. It is known that the next generation of portable devices will feature faster processors and larger memories, both of which require high operational power. %This is expected to Memory subsystem has already been identified as the energy bottleneck of the entire system. Consequently, memory hierarchies are being constructed to reduce the memory subsystem's energy dissipation. Caches and scratchpad memories represent two contrasting memory architectures. Scratchpads are both area and power efficient than caches. However, they require explicit support from the compiler for managing their contents. In this work, we present three approaches for the prudent utilization of the scratchpad memory of an ARM7 processor and of a M5 DSP based system. The first approach is based on the following observations. Firstly, a small memory requires less energy per access than that by a large memory. Secondly, applications in general consist of small and frequently accessed arrays and large but infrequently accessed arrays. Consequently, the approach partitions the large scratchpad into several small scratchpads. The arrays are also statically mapped such that the small arrays are mapped to small and energy efficient scratchpads. The approach leads to average energy savings of 52% and 35% in the data memory subsystem of the ARM7 and the M5 DSP, respectively. The second approach utilizes the scratchpad as an instruction buffer in a cache based memory hierarchy. The approach models the cache as a conflict graph and assigns instructions to the scratchpad. The objective is to minimize the energy consumption of the system while preserving the predictable behavior of the memory hierarchy. The approach results in an average energy saving of 21% against the above approach for the ARM7 based system. The last approach optimizes the energy consumption of the system by overlaying memory objects (i.e. code segments and data elements) on to the scratchpad. Memory objects with non-conflicting life-times are assigned to the same location on the scratchpad. This improves the scratchpad utilization, however, it requires copying memory objects on and off the scratchpad during the execution of the application. Average energy reductions of 34% and 33% are reported for the ARM7 and the M5 DSP based systems, respectively.
|
| Peter Marwedel. Towards laying common grounds for embedded system design education. In Workshop on Embedded Systems Education (WESE) 2005 [BibTeX][PDF][Abstract]@inproceedings { marwedel:05:wese,
author = {Marwedel, Peter},
title = {Towards laying common grounds for embedded system design education},
booktitle = {Workshop on Embedded Systems Education (WESE)},
year = {2005},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-wese.pdf},
confidential = {n},
abstract = {In this paper, we propose to introduce a common introductory course for embedded system education. The course puts the different areas of embedded system design into perspective and avoids an early over-specialization. Also, it motivates the students for attending more advanced theoretical courses. The content, the structure and the prerequisites of such a course are outlined. The course requires a basic understanding of computer hardware and software and can typically be taught in the second or third year.},
} In this paper, we propose to introduce a common introductory course for embedded system education. The course puts the different areas of embedded system design into perspective and avoids an early over-specialization. Also, it motivates the students for attending more advanced theoretical courses. The content, the structure and the prerequisites of such a course are outlined. The course requires a basic understanding of computer hardware and software and can typically be taught in the second or third year.
|
| Manish Verma, Klaus Petzold, Lars Wehmeyer, Heiko Falk and Peter Marwedel. Scratchpad Sharing Strategies for Multiprocess Embedded Systems: A First Approach. In IEEE 3rd Workshop on Embedded System for Real-Time Multimedia (ESTIMedia), pages 115-120 Jersey City, USA, September 2005 [BibTeX][PDF][Abstract]@inproceedings { verma:05:estimedia,
author = {Verma, Manish and Petzold, Klaus and Wehmeyer, Lars and Falk, Heiko and Marwedel, Peter},
title = {Scratchpad Sharing Strategies for Multiprocess Embedded Systems: A First Approach},
booktitle = {IEEE 3rd Workshop on Embedded System for Real-Time Multimedia (ESTIMedia)},
year = {2005},
pages = {115-120},
address = {Jersey City, USA},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2005-estimedia.pdf},
confidential = {n},
abstract = {Portable embedded systems require diligence in managing their energy consumption. Thus, power efficient processors coupled with onchip memories (e.g. caches, scratchpads) are the base of today's portable devices. Scratchpads are more energy efficient than caches but require software support for their utilization. Portable devices' applications consist of multiple processes for different tasks. However, all the previous scratchpad allocation approaches only consider single process applications. In this paper, we propose a set of optimal strategies to reduce the energy consumption of applications by sharing the scratchpad among multiple processes. The strategies assign both code and data elements to the scratchpad and result in average total energy reductions of 9\%-20\% against a published single process approach. Furthermore, the strategies generate Pareto-optimal curves for the applications allowing design time exploration of energy/scratchpad size tradeoffs.},
} Portable embedded systems require diligence in managing their energy consumption. Thus, power efficient processors coupled with onchip memories (e.g. caches, scratchpads) are the base of today's portable devices. Scratchpads are more energy efficient than caches but require software support for their utilization. Portable devices' applications consist of multiple processes for different tasks. However, all the previous scratchpad allocation approaches only consider single process applications. In this paper, we propose a set of optimal strategies to reduce the energy consumption of applications by sharing the scratchpad among multiple processes. The strategies assign both code and data elements to the scratchpad and result in average total energy reductions of 9%-20% against a published single process approach. Furthermore, the strategies generate Pareto-optimal curves for the applications allowing design time exploration of energy/scratchpad size tradeoffs.
|
| Lars Wehmeyer and Peter Marwedel. Influence of Onchip Scratchpad Memories on WCET prediction. In Proceedings of the 4th International Workshop on Worst-Case Execution Time (WCET) Analysis Catania, Sicily, Italy, June 2004 [BibTeX][PDF][Abstract]@inproceedings { wehm:04:wcet,
author = {Wehmeyer, Lars and Marwedel, Peter},
title = {Influence of Onchip Scratchpad Memories on WCET prediction},
booktitle = {Proceedings of the 4th International Workshop on Worst-Case Execution Time (WCET) Analysis},
year = {2004},
address = {Catania, Sicily, Italy},
month = {jun},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-WCET.pdf},
confidential = {n},
abstract = {In contrast to standard PCs and many high-performance computer systems, systems that have to meet real-time requirements usually do not feature caches, since caches primarily improve the average case performance, whereas their impact on WCET is generally hard to predict. Especially in embedded systems, scratchpad memories have become popular. Since these small, fast memories can be controlled by the programmer or the compiler, their behavior is perfectly predictable. In this paper, we study for the first time the impact of scratchpad memories on worst case execution time (WCET) prediction. Our results indicate that scratchpads can significantly improve WCET at no extra analysis cost.},
} In contrast to standard PCs and many high-performance computer systems, systems that have to meet real-time requirements usually do not feature caches, since caches primarily improve the average case performance, whereas their impact on WCET is generally hard to predict. Especially in embedded systems, scratchpad memories have become popular. Since these small, fast memories can be controlled by the programmer or the compiler, their behavior is perfectly predictable. In this paper, we study for the first time the impact of scratchpad memories on worst case execution time (WCET) prediction. Our results indicate that scratchpads can significantly improve WCET at no extra analysis cost.
|
| Lars Wehmeyer, Urs Helmig and Peter Marwedel. Compiler-optimized Usage of Partitioned Memories. In Proceedings of the 3rd Workshop on Memory Performance Issues (WMPI2004) Munich, Germany, June 2004 [BibTeX][PDF][Abstract]@inproceedings { wehm:04:wmpi,
author = {Wehmeyer, Lars and Helmig, Urs and Marwedel, Peter},
title = {Compiler-optimized Usage of Partitioned Memories},
booktitle = {Proceedings of the 3rd Workshop on Memory Performance Issues (WMPI2004)},
year = {2004},
address = {Munich, Germany},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-WMPI.pdf},
confidential = {n},
abstract = {In order to meet the requirements concerning both performance and energy consumption in embedded systems, new memory architectures are being introduced. Beside the well-known use of caches in the memory hierarchy, processor cores today also include small onchip memories called scratchpad memories whose usage is not controlled by hardware, but rather by the programmer or the compiler. Techniques for utilization of these scratchpads have been known for some time. Some new processors provide more than one scratchpad, making it necessary to enhance the workflow such that this complex memory architecture can be efficiently utilized. In this work, we present an energy model and an ILP formulation to optimally assign memory objects to different partitions of scratchpad memories at compile time, achieving energy savings of up to 22\% compared to previous approaches.},
} In order to meet the requirements concerning both performance and energy consumption in embedded systems, new memory architectures are being introduced. Beside the well-known use of caches in the memory hierarchy, processor cores today also include small onchip memories called scratchpad memories whose usage is not controlled by hardware, but rather by the programmer or the compiler. Techniques for utilization of these scratchpads have been known for some time. Some new processors provide more than one scratchpad, making it necessary to enhance the workflow such that this complex memory architecture can be efficiently utilized. In this work, we present an energy model and an ILP formulation to optimally assign memory objects to different partitions of scratchpad memories at compile time, achieving energy savings of up to 22% compared to previous approaches.
|
| Manish Verma, Lars Wehmeyer and Peter Marwedel. Cache Aware Scratchpad Allocation. In DATE Paris/France, February 2004 [BibTeX][PDF][Abstract]@inproceedings { verma:04:date,
author = {Verma, Manish and Wehmeyer, Lars and Marwedel, Peter},
title = {Cache Aware Scratchpad Allocation},
booktitle = {DATE},
year = {2004},
address = {Paris/France},
month = {feb},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-date.pdf},
confidential = {n},
abstract = {In the context of portable embedded systems, reducing energy is one of the prime objectives. Most high-end embedded microprocessors include onchip instruction and data caches, along with a small energy efficient scratchpad. Previous approaches for utilizing scratchpad did not consider caches and hence fail for the au courant architecture. In the presented work, we use the scratchpad for storing instructions and propose a generic Cache Aware Scratchpad Allocation (CASA) algorithm. We report an average reduction of 8-29\% in instruction memory energy consumption compared to a previously published technique for benchmarks from the Mediabench suite. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against preloaded loop caches, we report average energy savings of 20-44\%.},
} In the context of portable embedded systems, reducing energy is one of the prime objectives. Most high-end embedded microprocessors include onchip instruction and data caches, along with a small energy efficient scratchpad. Previous approaches for utilizing scratchpad did not consider caches and hence fail for the au courant architecture. In the presented work, we use the scratchpad for storing instructions and propose a generic Cache Aware Scratchpad Allocation (CASA) algorithm. We report an average reduction of 8-29% in instruction memory energy consumption compared to a previously published technique for benchmarks from the Mediabench suite. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against preloaded loop caches, we report average energy savings of 20-44%.
|
| Markus Lorenz and Peter Marwedel. Phase Coupled Code Generation for DSPs Using a Genetic Algorithm. In DATE, pages 1270-1275 June 2004 [BibTeX][PDF][Abstract]@inproceedings { lorenz:04:date,
author = {Lorenz, Markus and Marwedel, Peter},
title = {Phase Coupled Code Generation for DSPs Using a Genetic Algorithm},
booktitle = {DATE},
year = {2004},
pages = {1270-1275},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-date-lorenz.pdf},
confidential = {n},
abstract = {The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting special hardware features. Due to the irregular architectures present in todays DSPs there is a need of compilers which are capable of performing a phase coupling of the highly interdependent code generation subtasks and a graph based code selection. In this paper we present a code generator which performs a graph based code selection and a complete phase coupling of code selection, instruction scheduling (including compaction) and register allocation. In addition, our code generator takes into account effects of the subsequent address code generation phase. In order to solve the phase coupling problem and to handle the problem complexity, our code generator is based on a genetic algorithm. Experimental results for several benchmarks and an MP3 application for two DSPs show the effectiveness and the retargetability of our approach. Using the presented techniques, the number of execution cycles is reduced by 51\% on average for the M3-DSP and by 38\% on average for the ADSP2100 compared to standard techniques.},
} The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting special hardware features. Due to the irregular architectures present in todays DSPs there is a need of compilers which are capable of performing a phase coupling of the highly interdependent code generation subtasks and a graph based code selection. In this paper we present a code generator which performs a graph based code selection and a complete phase coupling of code selection, instruction scheduling (including compaction) and register allocation. In addition, our code generator takes into account effects of the subsequent address code generation phase. In order to solve the phase coupling problem and to handle the problem complexity, our code generator is based on a genetic algorithm. Experimental results for several benchmarks and an MP3 application for two DSPs show the effectiveness and the retargetability of our approach. Using the presented techniques, the number of execution cycles is reduced by 51% on average for the M3-DSP and by 38% on average for the ADSP2100 compared to standard techniques.
|
| Peter Marwedel, Lars Wehmeyer, Manish Verma, Stefan Steinke and Urs Helmig. Fast, predictable and low energy memory references through architecture-aware compilation. In ASPDAC, pages 4-11 January 2004 [BibTeX][PDF][Abstract]@inproceedings { marw:04:aspdac,
author = {Marwedel, Peter and Wehmeyer, Lars and Verma, Manish and Steinke, Stefan and Helmig, Urs},
title = {Fast, predictable and low energy memory references through architecture-aware compilation},
booktitle = {ASPDAC},
year = {2004},
pages = {4-11},
month = {jan},
keywords = {wcet},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-aspdac-spm.pdf},
confidential = {n},
abstract = {The design of future high-performance embedded systems is hampered by two problems: First, the required hardware needs more energy than is available from batteries. Second, current cache-based approaches for bridging the increasing speed gap between processors and memories cannot guarantee predictable real-time behavior. A contribution to solving both problems is made in this paper which describes a comprehensive set of algorithms that can be applied at design time in order to maximally exploit scratch pad memories (SPMs). We show that both the energy consumption as well as the computed worst case execution time (WCET) can be reduced by up to to 80\% and 48\%, respectively, by establishing a strong link between the memory architecture and the compiler.},
} The design of future high-performance embedded systems is hampered by two problems: First, the required hardware needs more energy than is available from batteries. Second, current cache-based approaches for bridging the increasing speed gap between processors and memories cannot guarantee predictable real-time behavior. A contribution to solving both problems is made in this paper which describes a comprehensive set of algorithms that can be applied at design time in order to maximally exploit scratch pad memories (SPMs). We show that both the energy consumption as well as the computed worst case execution time (WCET) can be reduced by up to to 80% and 48%, respectively, by establishing a strong link between the memory architecture and the compiler.
|
| Manish Verma, Lars Wehmeyer and Peter Marwedel. Dynamic Overlay of Scratchpad Memory for Energy Minimization. In International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS) Stockholm, Sweden, September 2004 [BibTeX][PDF][Abstract]@inproceedings { verma:04:codes,
author = {Verma, Manish and Wehmeyer, Lars and Marwedel, Peter},
title = {Dynamic Overlay of Scratchpad Memory for Energy Minimization},
booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
year = {2004},
address = {Stockholm, Sweden},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-isss.pdf},
confidential = {n},
abstract = {The memory subsystem accounts for a significant portion of the aggregate energy budget of contemporary embedded systems. Moreover, there exists a large potential for optimizing the energy consumption of the memory subsystem. Consequently, novel memories as well as novel algorithms for their efficient utilization are being designed. Scratchpads are known to perform better than caches in terms of power, performance, area and predictability. However, unlike caches they depend upon software allocation techniques for their utilization. In this paper, we present an allocation technique which analyzes the application and inserts instructions to dynamically copy both code segments and variables onto the scratchpad at runtime. We demonstrate that the problem of dynamically overlaying scratchpad is an extension of the Global Register Allocation problem. The overlay problem is solved optimally using ILP formulation techniques. Our approach improves upon the only previously known allocation technique for statically allocating both variables and code segments onto the scratchpad. Experiments report an average reduction of 34\% and 18\% in the energy consumption and the runtime of the applications, respectively. A minimal increase in code size is also reported.},
} The memory subsystem accounts for a significant portion of the aggregate energy budget of contemporary embedded systems. Moreover, there exists a large potential for optimizing the energy consumption of the memory subsystem. Consequently, novel memories as well as novel algorithms for their efficient utilization are being designed. Scratchpads are known to perform better than caches in terms of power, performance, area and predictability. However, unlike caches they depend upon software allocation techniques for their utilization. In this paper, we present an allocation technique which analyzes the application and inserts instructions to dynamically copy both code segments and variables onto the scratchpad at runtime. We demonstrate that the problem of dynamically overlaying scratchpad is an extension of the Global Register Allocation problem. The overlay problem is solved optimally using ILP formulation techniques. Our approach improves upon the only previously known allocation technique for statically allocating both variables and code segments onto the scratchpad. Experiments report an average reduction of 34% and 18% in the energy consumption and the runtime of the applications, respectively. A minimal increase in code size is also reported.
|
| Markus Lorenz, Peter Marwedel, Thorsten Dräger, Gerhard Fettweis and Rainer Leupers. Compiler based Exploration of DSP Energy Savings by SIMD Operations. In ASPDAC, pages 839-842 June 2004 [BibTeX][PDF][Abstract]@inproceedings { lorenz:04:aspdac,
author = {Lorenz, Markus and Marwedel, Peter and Dr\"ager, Thorsten and Fettweis, Gerhard and Leupers, Rainer},
title = {Compiler based Exploration of DSP Energy Savings by SIMD Operations},
booktitle = {ASPDAC},
year = {2004},
pages = {839-842},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-aspdac-lorenz.pdf},
confidential = {n},
abstract = {The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting their special architecture features. Besides the irregular DSP architectures for reducing chip size and energy consumption, single instruction multiple data (SIMD) functionality is frequently integrated with the intention of performance improvement. In order to get an energy-efficient system consisting of processor and compiler, it is necessary to optimize hardware as well as software. It is not obvious that SIMD operations can save any energy: if n operations are executed in parallel, each of them might consume the same amount of energy as if there were executed sequentially. Up to now, no work has been done to investigate the influence of compiler generated code containing SIMD operations w.r.t. the energy consumption. This paper deals with the exploration of the energy saving potential of SIMD operations for a DSP by using a generic compilation framework including an integrated instruction level energy cost model for our target architecture. Effects of SIMD operations on the energy consumption are shown for several benchmarks and an MP3 application.},
} The growing use of digital signal processors (DSPs) in embedded systems necessitates the use of optimizing compilers supporting their special architecture features. Besides the irregular DSP architectures for reducing chip size and energy consumption, single instruction multiple data (SIMD) functionality is frequently integrated with the intention of performance improvement. In order to get an energy-efficient system consisting of processor and compiler, it is necessary to optimize hardware as well as software. It is not obvious that SIMD operations can save any energy: if n operations are executed in parallel, each of them might consume the same amount of energy as if there were executed sequentially. Up to now, no work has been done to investigate the influence of compiler generated code containing SIMD operations w.r.t. the energy consumption. This paper deals with the exploration of the energy saving potential of SIMD operations for a DSP by using a generic compilation framework including an integrated instruction level energy cost model for our target architecture. Effects of SIMD operations on the energy consumption are shown for several benchmarks and an MP3 application.
|
| Peter Marwedel and Birgit Sirocic. Bridges to computer architectures education. In Proceedings of the Workshop of Computer Architecture Education (WCAE) Munich, Germany, June 2004 [BibTeX][PDF][Abstract]@inproceedings { marwedel:04:wcae,
author = {Marwedel, Peter and Sirocic, Birgit},
title = {Bridges to computer architectures education},
booktitle = {Proceedings of the Workshop of Computer Architecture Education (WCAE)},
year = {2004},
address = {Munich, Germany},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2004-wcae.pdf},
confidential = {n},
abstract = {Bridging the gap between the student's current knowledge and the technical details of computer systems is frequently required for the education of undergraduate students or students with a lack of previous technical knowledge. In this paper we will describe how to build bridges by the combination of introducing Flash-animations and the educational units of the RaVi system. In a first step the Flash-based animations make the students familiar with the underlaying principles. After that the student could jump to the more technical context by using the educational unit of RaVi. We have developed two bridges, one for explaining the principles of cache coherency protocols and the other for showing the concept of processor pipelines.},
} Bridging the gap between the student's current knowledge and the technical details of computer systems is frequently required for the education of undergraduate students or students with a lack of previous technical knowledge. In this paper we will describe how to build bridges by the combination of introducing Flash-animations and the educational units of the RaVi system. In a first step the Flash-based animations make the students familiar with the underlaying principles. After that the student could jump to the more technical context by using the educational unit of RaVi. We have developed two bridges, one for explaining the principles of cache coherency protocols and the other for showing the concept of processor pipelines.
|
| Heiko Falk and Peter Marwedel. Control Flow driven Splitting of Loop Nests at the Source Code Level. In Design, Automation and Test in Europe (DATE) 2003, pages 410-415 Munich/Germany, March 2003 [BibTeX][PDF][Abstract]@inproceedings { falk:03:date,
author = {Falk, Heiko and Marwedel, Peter},
title = {Control Flow driven Splitting of Loop Nests at the Source Code Level},
booktitle = {Design, Automation and Test in Europe (DATE) 2003},
year = {2003},
pages = {410-415},
address = {Munich/Germany},
month = {mar},
keywords = {sco},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-date.pdf},
confidential = {n},
abstract = {This paper presents a novel source code transformation for control flow optimization called loop nest splitting which minimizes the number of executed if-statements in loop nests of embedded multimedia applications. The goal of the optimization is to reduce runtimes and energy consumption. The analysis techniques are based on precise mathematical models combined with genetic algorithms. Due to the inherent portability of source code transformations, a very detailed benchmarking using 10 different processors can be performed. The application of our implemented algorithms to three real-life multimedia benchmarks leads to average speed-ups by 23.6\% - 62.1\% and energy savings by 19.6\% - 57.7\%. Furthermore, our optimization also leads to advantageous pipeline and cache performance.},
} This paper presents a novel source code transformation for control flow optimization called loop nest splitting which minimizes the number of executed if-statements in loop nests of embedded multimedia applications. The goal of the optimization is to reduce runtimes and energy consumption. The analysis techniques are based on precise mathematical models combined with genetic algorithms. Due to the inherent portability of source code transformations, a very detailed benchmarking using 10 different processors can be performed. The application of our implemented algorithms to three real-life multimedia benchmarks leads to average speed-ups by 23.6% - 62.1% and energy savings by 19.6% - 57.7%. Furthermore, our optimization also leads to advantageous pipeline and cache performance.
|
| Manish Verma, Lars Wehmeyer and Peter Marwedel. Efficient Scratchpad Allocation Algorithms for Energy Constrained Embedded Systems. In PACS 2003 San Diego, CA 2003, June 2003, Also in: Lecture Notes in Computer Science (LCNS 3164) Vol. 3164/2004 [BibTeX][PDF][Abstract]@inproceedings { verma:03:pacs,
author = {Verma, Manish and Wehmeyer, Lars and Marwedel, Peter},
title = {Efficient Scratchpad Allocation Algorithms for Energy Constrained Embedded Systems},
booktitle = {PACS 2003},
year = {2003},
address = {San Diego, CA 2003},
month = {jun},
note = {Also in: Lecture Notes in Computer Science (LCNS 3164) Vol. 3164/2004},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-pacs.pdf},
confidential = {n},
abstract = {In the context of portable embedded systems, reducing energy is one of the prime objectives. Memories are responsible for a significant percentage of a system's aggregate energy consumption. Consequently, novel memories as well as novel memory hierarchies are being designed to reduce the energy consumption. Caches and scratchpads are two contrasting variants of memory architectures. The former relies completely on hardware logic while the latter requires software for its utilization. Most high-end embedded microprocessors today include onchip instruction and data caches along with a scratchpad. Previous software approaches for utilizing scratchpad did not consider caches and hence fail for the prevalent high-end system architectures. In this work, we use the scratchpad for storing instructions. We solve the allocation problem using a greedy heuristic and also solve it optimally using an ILP formulation. We report an average reduction of 20.7\% in instruction memory energy consumption compared to a previously published technique. Larger reductions are also reported when the problem is solved optimally. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against that of preloaded loop caches, we report average energy savings of 28.9\% using the heuristic.},
} In the context of portable embedded systems, reducing energy is one of the prime objectives. Memories are responsible for a significant percentage of a system's aggregate energy consumption. Consequently, novel memories as well as novel memory hierarchies are being designed to reduce the energy consumption. Caches and scratchpads are two contrasting variants of memory architectures. The former relies completely on hardware logic while the latter requires software for its utilization. Most high-end embedded microprocessors today include onchip instruction and data caches along with a scratchpad. Previous software approaches for utilizing scratchpad did not consider caches and hence fail for the prevalent high-end system architectures. In this work, we use the scratchpad for storing instructions. We solve the allocation problem using a greedy heuristic and also solve it optimally using an ILP formulation. We report an average reduction of 20.7% in instruction memory energy consumption compared to a previously published technique. Larger reductions are also reported when the problem is solved optimally. The scratchpad in the presented architecture is similar to a preloaded loop cache. Comparing the energy consumption of our approach against that of preloaded loop caches, we report average energy savings of 28.9% using the heuristic.
|
| Peter Marwedel and Birgit Sirocic. Overcoming the Limitations of Traditional Media For Teaching Modern Processor Design. In International Conference on Microelectronic Systems Education (MSE 2003) Anaheim, June 2003 [BibTeX][PDF][Abstract]@inproceedings { marwedel:03:mse,
author = {Marwedel, Peter and Sirocic, Birgit},
title = {Overcoming the Limitations of Traditional Media For Teaching Modern Processor Design},
booktitle = {International Conference on Microelectronic Systems Education (MSE 2003)},
year = {2003},
address = {Anaheim},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-mse.pdf},
confidential = {n},
abstract = {Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books can be used for describing the dynamic behavior of processors. Visualization of this behavior, however, is impossible, due to the static nature of books. In this paper, we describe a Java-based tool for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol. These models were found to be more useful than general simulators in classroom use. The Java-based design also enables Internet-based distance learning. Tools are available at http://ls12-www.cs.tu-dortmund.de/ravi.},
} Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books can be used for describing the dynamic behavior of processors. Visualization of this behavior, however, is impossible, due to the static nature of books. In this paper, we describe a Java-based tool for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol. These models were found to be more useful than general simulators in classroom use. The Java-based design also enables Internet-based distance learning. Tools are available at http://ls12-www.cs.tu-dortmund.de/ravi.
|
| Manish Verma, Stefan Steinke and Peter Marwedel. Data Partitioning for Maximal Scratchpad Usage. In ASPDAC 2003 KitaKyushu/Japan, January 2003 [BibTeX][PDF][Abstract]@inproceedings { verma:03:aspdac,
author = {Verma, Manish and Steinke, Stefan and Marwedel, Peter},
title = {Data Partitioning for Maximal Scratchpad Usage},
booktitle = {ASPDAC 2003},
year = {2003},
address = {KitaKyushu/Japan},
month = {jan},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-aspdac.pdf},
confidential = {n},
abstract = {The energy consumption for Mobile Embedded Systems is a limiting factor because of today's battery capacities. The memory subsystem consumes a large chunk of the energy, necessitating its efficient utilization. Energy efficient scratchpads are thus becoming common, though unlike caches they require to be explicitly utilized. In this paper, an algorithm integrated into a compiler is presented which analyzes the application, partitions an array variable whenever its beneficial, appropriately modifies the application and selects the best set of variables and program parts to be placed onto the scratchpad. Results show an energy improvement between 5.7\% and 17.6\% for a variety of applications against a previously known algorithm.},
} The energy consumption for Mobile Embedded Systems is a limiting factor because of today's battery capacities. The memory subsystem consumes a large chunk of the energy, necessitating its efficient utilization. Energy efficient scratchpads are thus becoming common, though unlike caches they require to be explicitly utilized. In this paper, an algorithm integrated into a compiler is presented which analyzes the application, partitions an array variable whenever its beneficial, appropriately modifies the application and selects the best set of variables and program parts to be placed onto the scratchpad. Results show an energy improvement between 5.7% and 17.6% for a variety of applications against a previously known algorithm.
|
| Rainer Leupers, Oliver Wahlen, Manuel Hohenauer, Tim Kogel and Peter Marwedel. An Executable Intermediate Representation for Retargetable Compilation and High-Level Code Optimization. In International Conference on Information Communication Technologies in Education (SAMOS 2003) June 2003 [BibTeX][PDF][Abstract]@inproceedings { leupers:03:samos,
author = {Leupers, Rainer and Wahlen, Oliver and Hohenauer, Manuel and Kogel, Tim and Marwedel, Peter},
title = {An Executable Intermediate Representation for Retargetable Compilation and High-Level Code Optimization},
booktitle = {International Conference on Information Communication Technologies in Education (SAMOS 2003)},
year = {2003},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-samosIII.pdf},
confidential = {n},
abstract = {Due to fast time-to-market and IP reuse requirements, an increasing amount of the functionality of embedded HW/SW systems is implemented in software. As a consequence, software programming languages like C play an important role in system specification, design, and validation. Besides many other advantages, the C language offers executable specifications, with clear semantics and high simulation speed. However, virtually any tool operating on C specifications has to convert C sources into some intermediate representation (IR), during which the executability is normally lost. In order to overcome this problem, this paper describes a novel IR format, called IR-C, for the use in C based design tools, which combines the simplicity of three address code with the executability of C. Besides the IR-C format and its generation from ANSI C, we also describe its applications in the areas of validation, retargetable compilation, and sourcelevel code optimization.},
} Due to fast time-to-market and IP reuse requirements, an increasing amount of the functionality of embedded HW/SW systems is implemented in software. As a consequence, software programming languages like C play an important role in system specification, design, and validation. Besides many other advantages, the C language offers executable specifications, with clear semantics and high simulation speed. However, virtually any tool operating on C specifications has to convert C sources into some intermediate representation (IR), during which the executability is normally lost. In order to overcome this problem, this paper describes a novel IR format, called IR-C, for the use in C based design tools, which combines the simplicity of three address code with the executability of C. Besides the IR-C format and its generation from ANSI C, we also describe its applications in the areas of validation, retargetable compilation, and sourcelevel code optimization.
|
| Peter Marwedel and Birgit Sirocic. Multimedia componets for the visualization of dynamic behavior in computer architectures. In Proceedings of the Workshop of Computer Architecture Education (WCAE'03) San Diego, CA, June 2003 [BibTeX][PDF][Abstract]@inproceedings { marwedel:03:wcae,
author = {Marwedel, Peter and Sirocic, Birgit},
title = {Multimedia componets for the visualization of dynamic behavior in computer architectures},
booktitle = {Proceedings of the Workshop of Computer Architecture Education (WCAE'03)},
year = {2003},
address = {San Diego, CA},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2003-wcae.pdf},
confidential = {n},
abstract = {Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books use text for describing the dynamic behavior of processors. Visualization of this behavior, however, in impossible, due to the static nature of books. In this paper, we describe multimedia components for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol.},
} Understanding modern processors requires a good knowledge of the dynamic behavior of processors. Traditional media like books use text for describing the dynamic behavior of processors. Visualization of this behavior, however, in impossible, due to the static nature of books. In this paper, we describe multimedia components for visualizing the dynamic behavior of hardware structures, called RaVi (abbreviation for the German equivalent of "computer architecture visualization"). Available RaVi components include models of a microcoded MIPS architecture, of a MIPS pipeline, of scoreboarding, Tomasulo's algorithm and the MESI multiprocessor cache protocol.
|
| Stefan Steinke, Lars Wehmeyer, Bo-Sik Lee and Peter Marwedel. Assigning Program and Data Objects to Scratchpad for Energy Reduction. In DATE 2002 Paris/France, March 2002 [BibTeX][PDF][Abstract]@inproceedings { steinke:02:date,
author = {Steinke, Stefan and Wehmeyer, Lars and Lee, Bo-Sik and Marwedel, Peter},
title = {Assigning Program and Data Objects to Scratchpad for Energy Reduction},
booktitle = {DATE 2002},
year = {2002},
address = {Paris/France},
month = {mar},
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-date.pdf},
confidential = {n},
abstract = {The number of embedded systems is increasing and a remarkable percentage is designed as mobile applications. For the latter, the energy consumption is a limiting factor because of today's battery capacities. Besides the processor, memory accesses consume a high amount of energy. The use of additional less power hungry memories like caches or scratchpads is thus common. Caches incorporate the hardware control logic for moving data in and out automatically. On the other hand, this logic requires chip area and energy. A scratchpad memory is much more energy efficient, but there is a need for software control of its content. In this paper, an algorithm integrated into a compiler is presented which analyses the application and selects program and data parts which are placed into the scratchpad. Comparisons against a cache solution show remarkable advantages between 12\% and 43\% in energy consumption for designs of the same memory size.},
} The number of embedded systems is increasing and a remarkable percentage is designed as mobile applications. For the latter, the energy consumption is a limiting factor because of today's battery capacities. Besides the processor, memory accesses consume a high amount of energy. The use of additional less power hungry memories like caches or scratchpads is thus common. Caches incorporate the hardware control logic for moving data in and out automatically. On the other hand, this logic requires chip area and energy. A scratchpad memory is much more energy efficient, but there is a need for software control of its content. In this paper, an algorithm integrated into a compiler is presented which analyses the application and selects program and data parts which are placed into the scratchpad. Comparisons against a cache solution show remarkable advantages between 12% and 43% in energy consumption for designs of the same memory size.
|
| Stefan Steinke, Nils Grunwald, Lars Wehmeyer, Rajeshwari Banakar, M. Balakrishnan and Peter Marwedel. Reducing Energy Consumption by Dynamic Copying of Instructions onto Onchip Memory. In ISSS 2002 Kyoto/Japan, October 2002 [BibTeX][PDF][Abstract]@inproceedings { steinke:02:isss,
author = {Steinke, Stefan and Grunwald, Nils and Wehmeyer, Lars and Banakar, Rajeshwari and Balakrishnan, M. and Marwedel, Peter},
title = {Reducing Energy Consumption by Dynamic Copying of Instructions onto Onchip Memory},
booktitle = {ISSS 2002},
year = {2002},
address = {Kyoto/Japan},
month = {oct},
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-isss.pdf},
confidential = {n},
abstract = {The number of mobile embedded systems is increasing and all of them are limited in their uptime by their battery capacity. Several hardware changes have been introduced during the last years, but the steadily growing functionality still requires further energy reductions, e.g. by software optimizations. A significant amount of energy can be saved in the memory hierarchy where most of the energy is consumed. In this paper a new software technique is presented which supports the use of an onchip scratchpad memory by dynamically copying program parts into it. The set of selected program parts are determined with an optimal algorithm using integer linear programming. Experimental results show a reduction of the energy consumption by nearly 30\%, a performance increase by 25\% against a common cache system and energy improvements against a static approach of up to 38\%.},
} The number of mobile embedded systems is increasing and all of them are limited in their uptime by their battery capacity. Several hardware changes have been introduced during the last years, but the steadily growing functionality still requires further energy reductions, e.g. by software optimizations. A significant amount of energy can be saved in the memory hierarchy where most of the energy is consumed. In this paper a new software technique is presented which supports the use of an onchip scratchpad memory by dynamically copying program parts into it. The set of selected program parts are determined with an optimal algorithm using integer linear programming. Experimental results show a reduction of the energy consumption by nearly 30%, a performance increase by 25% against a common cache system and energy improvements against a static approach of up to 38%.
|
| Rajeshwari Banakar, Stefan Steinke, Bo-Sik Lee, M. Balakrishnan and Peter Marwedel. Scratchpad Memory : A Design Alternative for Cache On-chip memory in Embedded Systems. In CODES Estes Park (Colorado), May 2002 [BibTeX][PDF][Abstract]@inproceedings { banakar:02:codes,
author = {Banakar, Rajeshwari and Steinke, Stefan and Lee, Bo-Sik and Balakrishnan, M. and Marwedel, Peter},
title = {Scratchpad Memory : A Design Alternative for Cache On-chip memory in Embedded Systems},
booktitle = {CODES},
year = {2002},
address = {Estes Park (Colorado)},
month = {may},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-codes.pdf},
confidential = {n},
abstract = {In this paper we address the problem of on-chip memory selection for computationally intensive applications, by proposing scratch pad memory as an alternative to cache. Area and energy for different scratch pad and cache sizes are computed using the CACTI tool while performance was evaluated using the trace results of the simulator. The target processor chosen for evaluation was AT91M40400. The results clearly establish scratchpad memory as a low power alternative in most situations with an average energy reduction of 40\%. Further the average area-time reduction for the scratchpad memory was 46\% of the cache memory.},
} In this paper we address the problem of on-chip memory selection for computationally intensive applications, by proposing scratch pad memory as an alternative to cache. Area and energy for different scratch pad and cache sizes are computed using the CACTI tool while performance was evaluated using the trace results of the simulator. The target processor chosen for evaluation was AT91M40400. The results clearly establish scratchpad memory as a low power alternative in most situations with an average energy reduction of 40%. Further the average area-time reduction for the scratchpad memory was 46% of the cache memory.
|
| Peter Marwedel and Luca Benini. Low-power/low-energy embedded software. In Tutorial at Design, Automation and Test in Europe (DATE) Paris, 2002, March 2002 [BibTeX][Abstract]@inproceedings { marwedel:02:date,
author = {Marwedel, Peter and Benini, Luca},
title = {Low-power/low-energy embedded software},
booktitle = {Tutorial at Design, Automation and Test in Europe (DATE)},
year = {2002},
address = {Paris, 2002},
month = {mar},
confidential = {n},
abstract = {The size of the embedded system market is expected to grow significantly in the next years. For many of these systems, power and energy management are becoming primary issues due to rapidly increasing computational requirement and the slow rate of improvements in battery technology. Hence, researchers and designers have to be aware of approaches for reducing the power and energy consumption of embedded systems. In addition to standard techniques used at the device and the circuit level, new techniques are required which exploit the fact that a major part of the functionality of embedded systems is implemented in software. There are various levels at which the energy consumption of software can be considered, including the high-level algorithm descriptions, compilers, and operating systems. These levels will be covered in the tutorial. Summary: With a focus on programmable embedded systems, this tutorial will: \begin{itemize} \item survey the interaction of architecture, operating systems, compilers and memories from a power/energy focus, \item present specific contributors of each part to power and energy, and \item outline software techniques for minimization of power/energy. \end{itemize} Outline: In the first section, an introduction to the topic will be provided. Due to the large influence of the memory architecture on the total energy consumption, different memory architectures will be presented next. We will show how partitioned memories can help reducing the energy consumption. In the next section, we will describe how partitioned memory architectures and other features of embedded systems can be exploited in compilers. This includes the exploitation of scratch-pad memories and a comparison between scratch-pads and caches. In addition, this includes an analysis of the size of register files. Furthermore, we will explain techniques for reducing the memory traffic by global optimizations designed for multimedia applications. This section also comprises a description of applicable standard compiler optimizations and their potential for energy reductions as well as a brief introduction to compression techniques. The final section describes system software and real-time operating system (RTOS) issues. This will include hardware for RTOS-based power management, software support for power management, power-aware process scheduling and power-aware device management. Exploitation of application-specific information and power management of distributed systems will also be covered. Audience: Researchers starting to work on embedded systems or on low power design techniques; designers interested in getting to know available methodologies for low power software design.},
} The size of the embedded system market is expected to grow significantly in the next years. For many of these systems, power and energy management are becoming primary issues due to rapidly increasing computational requirement and the slow rate of improvements in battery technology. Hence, researchers and designers have to be aware of approaches for reducing the power and energy consumption of embedded systems. In addition to standard techniques used at the device and the circuit level, new techniques are required which exploit the fact that a major part of the functionality of embedded systems is implemented in software. There are various levels at which the energy consumption of software can be considered, including the high-level algorithm descriptions, compilers, and operating systems. These levels will be covered in the tutorial. Summary: With a focus on programmable embedded systems, this tutorial will: itemize \item survey the interaction of architecture, operating systems, compilers and memories from a power/energy focus, \item present specific contributors of each part to power and energy, and \item outline software techniques for minimization of power/energy. itemize Outline: In the first section, an introduction to the topic will be provided. Due to the large influence of the memory architecture on the total energy consumption, different memory architectures will be presented next. We will show how partitioned memories can help reducing the energy consumption. In the next section, we will describe how partitioned memory architectures and other features of embedded systems can be exploited in compilers. This includes the exploitation of scratch-pad memories and a comparison between scratch-pads and caches. In addition, this includes an analysis of the size of register files. Furthermore, we will explain techniques for reducing the memory traffic by global optimizations designed for multimedia applications. This section also comprises a description of applicable standard compiler optimizations and their potential for energy reductions as well as a brief introduction to compression techniques. The final section describes system software and real-time operating system (RTOS) issues. This will include hardware for RTOS-based power management, software support for power management, power-aware process scheduling and power-aware device management. Exploitation of application-specific information and power management of distributed systems will also be covered. Audience: Researchers starting to work on embedded systems or on low power design techniques; designers interested in getting to know available methodologies for low power software design.
|
| Peter Marwedel and Srinivas Devadas (eds.). LCTES'02-SCOPES'02: Joint Conference on Languages, Compilers and Tools for Embedded Systems & Software and Compilers for Embedded Systems. In ACM Press June 2002 [BibTeX][Abstract]@inproceedings { marwedel:02:lctes,
author = {Marwedel, Peter and Devadas (eds.), Srinivas},
title = {LCTES'02-SCOPES'02: Joint Conference on Languages, Compilers and Tools for Embedded Systems \\& Software and Compilers for Embedded Systems},
booktitle = {ACM Press},
year = {2002},
month = {jun},
confidential = {n},
abstract = {(Draft version of the preface, final layout is different): This volume contains the proceedings of the Joint Languages, Compilers, and Tools for Embedded Systems (LCTES'02) and Software and Compilers for Embedded Systems (SCOPES'02) Conference. LCTES/SCOPES'02 took place in Berlin during June 19th to the 21st. For the first time, LCTES and SCOPES were held together, resulting in stimulating contacts between researchers predominantly having a background in programming languages and electronic design automation, respectively. Also, for the very first time, LCTES was held as a conference and not as a workshop. LCTES/SCOPES'02 received a total of 73 papers. During a comprehensive review process a total of 234 reviews were submitted. Finally, 25 papers were accepted and included in the resulting high-quality program. Accepted papers covered the following areas: compilers including low-energy compilation, synthesis, design space exploration, debugging and validation, code generation and register allocation, processor modeling, hardware/software codesign, and real-time scheduling. Accepted papers were grouped into 10 sessions. In addition, two invited keynotes given by Dr. Philippe Magarshack of STMicroelectronics and Prof. Gerhard Fettweis of Systemonic AG and of Dresden University emphasized industrial view points and perspectives. We thank the following members of the program committee for their efforts in reviewing the papers: David August, Shuvra S. Bhattacharyya, Raul Camposano, Keith D. Cooper, Rajiv Gupta, Mary Hall, Seongsoo Hong, Masaharu Imai, Ahmed Jerraya, Jochen Jess, Kurt Keutzer, Rainer Leupers, Annie Liu, Jef van Meerbergen, SangLyul Min, Frank Mueller, Tatsuo Nakajima, Alex Nicolau, Santosh Pande, Manas Saksena, Bob Rau, Wolfgang Rosenstiel, Sreeranga P. Rajan, Gang-Ryung Uh, Carl Von Platen, Bernard Wess, David Whalley, Reinhard Wilhelm, and Hiroto Yasuura. We thank Jens Knoop of the University of Dortmund who served as the Finance Chair, Ahmed Jerraya of TIMA, Grenoble who served as the Publicity Chair for their efforts in organizing LCTES/SCOPES'02 and making it a success. Thanks to Armin Zimmermann of the Technical University of Berlin for organizing the social event in LCTES/SCOPES'02. We acknowledge the sponsorship of ACM SIGPLAN. ACM also provided travel grants to students. The European Design Automation Association (EDAA) provided an in-cooperation status to LCTES/SCOPES. Finally, we thank the steering committee for their efforts in ensuring continuity from the previous LCTES workshops. Peter Marwedel, University of Dortmund \\ Srinivas Devadas, MIT \\ Co-Chairs of LCTES/SCOPES'02},
} (Draft version of the preface, final layout is different): This volume contains the proceedings of the Joint Languages, Compilers, and Tools for Embedded Systems (LCTES'02) and Software and Compilers for Embedded Systems (SCOPES'02) Conference. LCTES/SCOPES'02 took place in Berlin during June 19th to the 21st. For the first time, LCTES and SCOPES were held together, resulting in stimulating contacts between researchers predominantly having a background in programming languages and electronic design automation, respectively. Also, for the very first time, LCTES was held as a conference and not as a workshop. LCTES/SCOPES'02 received a total of 73 papers. During a comprehensive review process a total of 234 reviews were submitted. Finally, 25 papers were accepted and included in the resulting high-quality program. Accepted papers covered the following areas: compilers including low-energy compilation, synthesis, design space exploration, debugging and validation, code generation and register allocation, processor modeling, hardware/software codesign, and real-time scheduling. Accepted papers were grouped into 10 sessions. In addition, two invited keynotes given by Dr. Philippe Magarshack of STMicroelectronics and Prof. Gerhard Fettweis of Systemonic AG and of Dresden University emphasized industrial view points and perspectives. We thank the following members of the program committee for their efforts in reviewing the papers: David August, Shuvra S. Bhattacharyya, Raul Camposano, Keith D. Cooper, Rajiv Gupta, Mary Hall, Seongsoo Hong, Masaharu Imai, Ahmed Jerraya, Jochen Jess, Kurt Keutzer, Rainer Leupers, Annie Liu, Jef van Meerbergen, SangLyul Min, Frank Mueller, Tatsuo Nakajima, Alex Nicolau, Santosh Pande, Manas Saksena, Bob Rau, Wolfgang Rosenstiel, Sreeranga P. Rajan, Gang-Ryung Uh, Carl Von Platen, Bernard Wess, David Whalley, Reinhard Wilhelm, and Hiroto Yasuura. We thank Jens Knoop of the University of Dortmund who served as the Finance Chair, Ahmed Jerraya of TIMA, Grenoble who served as the Publicity Chair for their efforts in organizing LCTES/SCOPES'02 and making it a success. Thanks to Armin Zimmermann of the Technical University of Berlin for organizing the social event in LCTES/SCOPES'02. We acknowledge the sponsorship of ACM SIGPLAN. ACM also provided travel grants to students. The European Design Automation Association (EDAA) provided an in-cooperation status to LCTES/SCOPES. Finally, we thank the steering committee for their efforts in ensuring continuity from the previous LCTES workshops. Peter Marwedel, University of Dortmund \ Srinivas Devadas, MIT \ Co-Chairs of LCTES/SCOPES'02
|
| Peter Marwedel, Khac Dung Cong and Sergej Schwenk. RAVI: Interactive Visualization of information systems dynamics using a Java-based schematic editor and simulator. In Open IFIP-GI-Conference on Social, Ethical and Cognitive Issues of Informatics and ICT (Information and Communication Technologies), SECIII June 2002 [BibTeX][PDF][Abstract]@inproceedings { marwedel:02:seciii,
author = {Marwedel, Peter and Cong, Khac Dung and Schwenk, Sergej},
title = {RAVI: Interactive Visualization of information systems dynamics using a Java-based schematic editor and simulator},
booktitle = {Open IFIP-GI-Conference on Social, Ethical and Cognitive Issues of Informatics and ICT (Information and Communication Technologies), SECIII},
year = {2002},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-seciii.pdf},
confidential = {n},
abstract = {One of the key limitations of traditional media, e.g. books, for teaching is their lack of visualizing the dynamic behavior of systems. Videos tapes and video distribution techniques have made it possible to show non-interactive media elements to students. However, one of the key advantages of multimedia systems over traditional media is their potential of providing interactiveness. This interactiveness, however, is not easy to obtain, since it requires the simulation of the system to be visualized. Designing simulators can be a challenging task that can not be solved within the time-frame usually allocated for multi-media projects. On the other hand, available simulators are not suitable for classroom use. They are frequently designed for optimum simulation speed and complex design projects. Ease of use, excellent visualization and portability have normally not been top goals for simulator design. Also, powerful simulators are typically proprietary and come at high costs, preventing their widespread deployment to class rooms and into the hands of students.},
} One of the key limitations of traditional media, e.g. books, for teaching is their lack of visualizing the dynamic behavior of systems. Videos tapes and video distribution techniques have made it possible to show non-interactive media elements to students. However, one of the key advantages of multimedia systems over traditional media is their potential of providing interactiveness. This interactiveness, however, is not easy to obtain, since it requires the simulation of the system to be visualized. Designing simulators can be a challenging task that can not be solved within the time-frame usually allocated for multi-media projects. On the other hand, available simulators are not suitable for classroom use. They are frequently designed for optimum simulation speed and complex design projects. Ease of use, excellent visualization and portability have normally not been top goals for simulator design. Also, powerful simulators are typically proprietary and come at high costs, preventing their widespread deployment to class rooms and into the hands of students.
|
| Peter Marwedel. Embedded Software: How to make it efficient?. In Proceedings of the Euromicro Symposium on Digital System Design Dortmund, 2002 [BibTeX][PDF][Abstract]@inproceedings { marwedel:02:euromicro,
author = {Marwedel, Peter},
title = {Embedded Software: How to make it efficient?},
booktitle = {Proceedings of the Euromicro Symposium on Digital System Design},
year = {2002},
address = {Dortmund},
publisher = {IEEE Design \& Test},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2002-euromicro.pdf},
confidential = {n},
abstract = {This paper stresses the importance of designing efficient embedded software and it provides a global view of some of the techniques that have been developed to meet this goal. These techniques include high-level transformations, compiler optimizations reducing the energy consumption of embedded programs and optimizations exploiting architectural features of embedded processors. Such optimizations lead to significant reductions of the execution time, the required energy and the memory size of embedded applications. Despite this, they can hardly be found in any available compiler. particular application) and processors and software. The latter is used to meet the flexibility requirements found for almost all of today's applications.},
} This paper stresses the importance of designing efficient embedded software and it provides a global view of some of the techniques that have been developed to meet this goal. These techniques include high-level transformations, compiler optimizations reducing the energy consumption of embedded programs and optimizations exploiting architectural features of embedded processors. Such optimizations lead to significant reductions of the execution time, the required energy and the memory size of embedded applications. Despite this, they can hardly be found in any available compiler. particular application) and processors and software. The latter is used to meet the flexibility requirements found for almost all of today's applications.
|
| Manoj Kumar Jain, Lars Wehmeyer, Stefan Steinke, Peter Marwedel and M. Balakrishnan. Evaluating Register File Size in ASIP Design. In CODES Copenhagen (Denmark), April 2001 [BibTeX][PDF][Abstract]@inproceedings { jain:01:codes,
author = {Jain, Manoj Kumar and Wehmeyer, Lars and Steinke, Stefan and Marwedel, Peter and Balakrishnan, M.},
title = {Evaluating Register File Size in ASIP Design},
booktitle = {CODES},
year = {2001},
address = {Copenhagen (Denmark)},
month = {apr},
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-codes.pdf},
confidential = {n},
abstract = {Interest in synthesis of Application Specific Instruction Set Processors or ASIPs has increased considerably and a number of methodologies have been proposed for ASIP design. A key step in ASIP synthesis involves deciding architectural features based on application requirements and constraints. In this paper we observe the effect of changing register file size on the performance as well as power and energy consumption. Detailed data is generated and analyzed for a number of application programs. Results indicate that choice of an appropriate number of registers has a significant impact on performance.},
} Interest in synthesis of Application Specific Instruction Set Processors or ASIPs has increased considerably and a number of methodologies have been proposed for ASIP design. A key step in ASIP synthesis involves deciding architectural features based on application requirements and constraints. In this paper we observe the effect of changing register file size on the performance as well as power and energy consumption. Detailed data is generated and analyzed for a number of application programs. Results indicate that choice of an appropriate number of registers has a significant impact on performance.
|
| Markus Lorenz, Rainer Leupers, Peter Marwedel, Thorsten Dräger and Gerhard P. Fettweis. Low-Energy DSP Code Generation Using a Genetic Algorithm. In ICCD '01 Austin/Texas/USA, September 2001 [BibTeX][PDF][Abstract]@inproceedings { lorenz:01:iccd,
author = {Lorenz, Markus and Leupers, Rainer and Marwedel, Peter and Dr\"ager, Thorsten and Fettweis, Gerhard P.},
title = {Low-Energy DSP Code Generation Using a Genetic Algorithm},
booktitle = {ICCD '01},
year = {2001},
address = {Austin/Texas/USA},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-iccd.pdf},
confidential = {n},
abstract = {This paper deals with low-energy code generation for a highly optimized digital signal processor designed for mobile communication applications. We present a genetic algorithm based code generator (GCG), and an instruction-level power model for this processor. Our code generator is capable of reducing the power dissipation of target applications by means of two techniques: First, GCG minimizes the number of memory accesses by using a special list-scheduling algorithm. This technique makes it possible to perform graph based code selection and to take into account the high interdependencies of the subtasks of code generation by phase coupling. In addition, GCG optimizes the scheduling of processor instructions with respect to the instruction-level power model based on a gate level simulation. Experimental results for several benchmarks show the effectiveness of our approach.},
} This paper deals with low-energy code generation for a highly optimized digital signal processor designed for mobile communication applications. We present a genetic algorithm based code generator (GCG), and an instruction-level power model for this processor. Our code generator is capable of reducing the power dissipation of target applications by means of two techniques: First, GCG minimizes the number of memory accesses by using a special list-scheduling algorithm. This technique makes it possible to perform graph based code selection and to take into account the high interdependencies of the subtasks of code generation by phase coupling. In addition, GCG optimizes the scheduling of processor instructions with respect to the instruction-level power model based on a gate level simulation. Experimental results for several benchmarks show the effectiveness of our approach.
|
| Markus Lorenz, David Kottmann, Steven Bashford, Rainer Leupers and Peter Marwedel. Optimized Address Assignment for DSPs with SIMD Memory Accesses. In ASP-DAC '01 Yokohama/Japan, January 2001 [BibTeX][PDF][Abstract]@inproceedings { lorenz:01:aspdac,
author = {Lorenz, Markus and Kottmann, David and Bashford, Steven and Leupers, Rainer and Marwedel, Peter},
title = {Optimized Address Assignment for DSPs with SIMD Memory Accesses},
booktitle = {ASP-DAC '01},
year = {2001},
address = {Yokohama/Japan},
month = {jan},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-aspdac.pdf},
confidential = {n},
abstract = {This paper deals with address assignment in code generation for digital signal processors (DSPs) with SIMD (single instruction multiple data) memory accesses. In these processors data are organized in groups (or partitions), whose elements share one common memory address. In order to optimize program performance for processors with such memory architectures it is important to have a suitable memory layout of the variables. We propose a two-step address assignment technique for scalar variables using a genetic algorithm based partitioning method and a graph based heuristic which makes use of available DSP address generation hardware. We show that our address assignment techniques lead to a significant code quality improvement compared to heuristics.},
} This paper deals with address assignment in code generation for digital signal processors (DSPs) with SIMD (single instruction multiple data) memory accesses. In these processors data are organized in groups (or partitions), whose elements share one common memory address. In order to optimize program performance for processors with such memory architectures it is important to have a suitable memory layout of the variables. We propose a two-step address assignment technique for scalar variables using a genetic algorithm based partitioning method and a graph based heuristic which makes use of available DSP address generation hardware. We show that our address assignment techniques lead to a significant code quality improvement compared to heuristics.
|
| Peter Marwedel, Stefan Steinke and Lars Wehmeyer. Compilation techniques for energy-, code-size-, and run-time-efficient embedded software. In Int. Workshop on Advanced Compiler Techniques for High Performance and Embedded Processors Bucharest, Hungary, 2001 [BibTeX][Abstract]@inproceedings { marwedel:01:iwact,
author = {Marwedel, Peter and Steinke, Stefan and Wehmeyer, Lars},
title = {Compilation techniques for energy-, code-size-, and run-time-efficient embedded software},
booktitle = {Int. Workshop on Advanced Compiler Techniques for High Performance and Embedded Processors},
year = {2001},
address = {Bucharest, Hungary},
keywords = {ecc},
confidential = {n},
abstract = {This paper is motivated by two essential characteristics of embedded systems: the increasing amount of software that is used for implementing embedded systems and the need for implementing embedded systems efficiently. As a consequence, embedded software has to be efficient. In the following, we will present techniques for generating efficient machine code for architectures which are typically found in embedded systems. We will demonstrate, using examples, how compilers for embedded processors can exploit features that are found in embedded processors.},
} This paper is motivated by two essential characteristics of embedded systems: the increasing amount of software that is used for implementing embedded systems and the need for implementing embedded systems efficiently. As a consequence, embedded software has to be efficient. In the following, we will present techniques for generating efficient machine code for architectures which are typically found in embedded systems. We will demonstrate, using examples, how compilers for embedded processors can exploit features that are found in embedded processors.
|
| Stefan Steinke, Markus Knauer, Lars Wehmeyer and Peter Marwedel. An Accurate and Fine Grain Instruction-Level Energy Model Supporting Software Optimizations. In PATMOS Yverdon (Switzerland), September 2001 [BibTeX][PDF][Abstract]@inproceedings { steinke:01:patmos,
author = {Steinke, Stefan and Knauer, Markus and Wehmeyer, Lars and Marwedel, Peter},
title = {An Accurate and Fine Grain Instruction-Level Energy Model Supporting Software Optimizations},
booktitle = {PATMOS},
year = {2001},
address = {Yverdon (Switzerland)},
month = {sep},
keywords = {ecc},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/2001-patmos.pdf},
confidential = {n},
abstract = {Power aware compilers have been under research during the last few years. However, there is still a need for accurate energy models for supporting software optimizations. In this paper we present a new energy model on the instruction level. As an addition to former models, the bit toggling on internal and external buses as well as accesses to off-chip memories are considered. To determine the characteristics, a measuring method is presented which can be used to establish the energy model without detailed knowledge of the internal processor structures. Finally, the proposed energy model is established for the ARM7TDMI RISC processor.},
} Power aware compilers have been under research during the last few years. However, there is still a need for accurate energy models for supporting software optimizations. In this paper we present a new energy model on the instruction level. As an addition to former models, the bit toggling on internal and external buses as well as accesses to off-chip memories are considered. To determine the characteristics, a measuring method is presented which can be used to establish the energy model without detailed knowledge of the internal processor structures. Finally, the proposed energy model is established for the ARM7TDMI RISC processor.
|
| Peter Marwedel. Compilation Techniques for Embedded Software. In Invited Tutorial, Asia and South Pacific Design Automation Conference (ASP-DAC) Yokohama, Japan, 2001 [BibTeX][Abstract]@inproceedings { marwedel:01:aspdac,
author = {Marwedel, Peter},
title = {Compilation Techniques for Embedded Software},
booktitle = {Invited Tutorial, Asia and South Pacific Design Automation Conference (ASP-DAC)},
year = {2001},
address = {Yokohama, Japan},
confidential = {n},
abstract = {Embedded systems demand for efficient processor architectures, optimized for application domains or applications. Current compiler technology supports these architectures poorly and has been recognized as a bottleneck for designing systems. Recent research projects aim at removing this bottleneck. We will present code optimization approaches taking the characteristics of embedded processor architectures into account. The following topics will be included: \begin{itemize} \item memory allocation techniques, \item compiler techniques for VLIW machines, \item compiler techniques aiming at energy minimization of embedded software, \item retargetability of compilers, \item code compression techniques. \end{itemize}},
} Embedded systems demand for efficient processor architectures, optimized for application domains or applications. Current compiler technology supports these architectures poorly and has been recognized as a bottleneck for designing systems. Recent research projects aim at removing this bottleneck. We will present code optimization approaches taking the characteristics of embedded processor architectures into account. The following topics will be included: itemize \item memory allocation techniques, \item compiler techniques for VLIW machines, \item compiler techniques aiming at energy minimization of embedded software, \item retargetability of compilers, \item code compression techniques. itemize
|
| Matthias Weiss, Gerhard Fettweis, Markus Lorenz, Rainer Leupers and Peter Marwedel. Toolumgebung fuer plattformbasierte DSPs der naechsten Generation. In DSP Deutschland Munich/Germany, September 1999 [BibTeX][PDF][Abstract]@inproceedings { weiss:1999:dsp,
author = {Weiss, Matthias and Fettweis, Gerhard and Lorenz, Markus and Leupers, Rainer and Marwedel, Peter},
title = {Toolumgebung fuer plattformbasierte DSPs der naechsten Generation},
booktitle = {DSP Deutschland},
year = {1999},
address = {Munich/Germany},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-dspd.pdf},
confidential = {n},
abstract = {Digitale Signalprozessoren (DSPs) werden ueberall dort eingesetzt, wo bestimmte Performance- oder Aufwandsanforderungen nicht von Standardprozessoren erfuellt werden koennen. Beispiele sind Media- und Sprachcodecs, Modems, sowie Bild- und Spracherkennungen. Aufgrund der unterschiedlichen Performanceanforderungen dieser Anwendungen beginnen DSP-Hersteller nun, nicht einen DSP sondern eine Plattformloesung anzukuendigen. Beispiele sind der StarCore von Motorola/Lucent oder der TigerSharc von Analog Devices. Am Lehrstuhl Mobile Nachrichtensysteme der TU Dresden ist hierbei die M3-DSP Plattform entstanden. Eine Programmierumgebung fuer eine solche DSP-Plattform muss im Gegensatz zu konventionellen Ansaetzen auch die Anforderungen der Plattform mit unterstuetzen. In diesem Paper stellen wir die Architektur einer solchen Toolumgebung fuer die M3-Plattform vor und demonstrieren die Anwendbarkeit anhand von Beispielen.},
} Digitale Signalprozessoren (DSPs) werden ueberall dort eingesetzt, wo bestimmte Performance- oder Aufwandsanforderungen nicht von Standardprozessoren erfuellt werden koennen. Beispiele sind Media- und Sprachcodecs, Modems, sowie Bild- und Spracherkennungen. Aufgrund der unterschiedlichen Performanceanforderungen dieser Anwendungen beginnen DSP-Hersteller nun, nicht einen DSP sondern eine Plattformloesung anzukuendigen. Beispiele sind der StarCore von Motorola/Lucent oder der TigerSharc von Analog Devices. Am Lehrstuhl Mobile Nachrichtensysteme der TU Dresden ist hierbei die M3-DSP Plattform entstanden. Eine Programmierumgebung fuer eine solche DSP-Plattform muss im Gegensatz zu konventionellen Ansaetzen auch die Anforderungen der Plattform mit unterstuetzen. In diesem Paper stellen wir die Architektur einer solchen Toolumgebung fuer die M3-Plattform vor und demonstrieren die Anwendbarkeit anhand von Beispielen.
|
| U. Bieker, M. Kaibel, P. Marwedel and W. Geisselhardt. STAR-DUST: Hierarchical Test of Embedded Processors by Self-Test Programs. In European Test Workshop (ETW) June 1999 [BibTeX][PDF][Abstract]@inproceedings { bieker:1999:etw,
author = {Bieker, U. and Kaibel, M. and Marwedel, P. and Geisselhardt, W.},
title = {STAR-DUST: Hierarchical Test of Embedded Processors by Self-Test Programs},
booktitle = {European Test Workshop (ETW)},
year = {1999},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1999-etw.pdf},
confidential = {n},
abstract = {This paper describes the hierarchical test-generation method STAR-DUST, using self-test program generator RESTART, test pattern generator DUST, fault simulator FAUST and SYNOPSYS logic synthesis tools. RESTART aims at supporting self-test of embedded processors. Its integration into the STAR-DUST environment allows test program generation for realistic fault assumptions and provides, for the first time, experimental data on the fault coverage that can be obtained for full processor models. Experimental data shows that fault masking is not a problem even though the considered processor has to perform result comparison and arithmetic operations in the same ALU.},
} This paper describes the hierarchical test-generation method STAR-DUST, using self-test program generator RESTART, test pattern generator DUST, fault simulator FAUST and SYNOPSYS logic synthesis tools. RESTART aims at supporting self-test of embedded processors. Its integration into the STAR-DUST environment allows test program generation for realistic fault assumptions and provides, for the first time, experimental data on the fault coverage that can be obtained for full processor models. Experimental data shows that fault masking is not a problem even though the considered processor has to perform result comparison and arithmetic operations in the same ALU.
|
| Sujit Dey Rajesh K. Gupta and Peter Marwedel. Embedded System Design and Validation: Building Systems from IC Cores to Chips (Tutorial). In Eleventh Int. conf on VLSI Design 1998 [BibTeX][Abstract]@inproceedings { gupta:1998:vlsi,
author = {Rajesh K. Gupta, Sujit Dey and Marwedel, Peter},
title = {Embedded System Design and Validation: Building Systems from IC Cores to Chips (Tutorial)},
booktitle = {Eleventh Int. conf on VLSI Design},
year = {1998},
confidential = {n},
abstract = {This tutorial addresses the challenges in the design and validation of an embedded system-on-a-chip. In particular, we examine the design and use of pre-designed, pre-characterized, and pre-verified silicon building blocks called `cores.' We examine design styles for portability and reuse of core cells, and how these can be validated for given applications. We examine the trend in IC cores by describing a wide range of available processor, DSP, interface and analog cores and how these are used in application domains such as multimedia and networkings systems. Covered topics include emulation, in-circuit emulation, compliance test environments, instruction-set simulation, and hardware-software co-simulation. We discuss the increasing software content on chips, made possible by proliferation of processor and DSP cores. We examine application-specific instruction-set (ASIP) and signal processors (ASSPs) and their use in specific application domains. We then present new code optimization approaches related to memory allocation and code compaction that exploit the embedded processor architectures. Finally, we present techniques for retargeting compilers to new architectures easily. In particular, we show how compilers can be generated from descriptions of processor architectures.},
} This tutorial addresses the challenges in the design and validation of an embedded system-on-a-chip. In particular, we examine the design and use of pre-designed, pre-characterized, and pre-verified silicon building blocks called `cores.' We examine design styles for portability and reuse of core cells, and how these can be validated for given applications. We examine the trend in IC cores by describing a wide range of available processor, DSP, interface and analog cores and how these are used in application domains such as multimedia and networkings systems. Covered topics include emulation, in-circuit emulation, compliance test environments, instruction-set simulation, and hardware-software co-simulation. We discuss the increasing software content on chips, made possible by proliferation of processor and DSP cores. We examine application-specific instruction-set (ASIP) and signal processors (ASSPs) and their use in specific application domains. We then present new code optimization approaches related to memory allocation and code compaction that exploit the embedded processor architectures. Finally, we present techniques for retargeting compilers to new architectures easily. In particular, we show how compilers can be generated from descriptions of processor architectures.
|
| Ralf Niemann and Peter Marwedel. Synthesis of Communicating Controllers for Concurrent Hardware/Software Systems. In Design, Automation and Test in Europe (DATE) Paris/France, February 1998 [BibTeX][PDF][Abstract]@inproceedings { niemann:1998:date,
author = {Niemann, Ralf and Marwedel, Peter},
title = {Synthesis of Communicating Controllers for Concurrent Hardware/Software Systems},
booktitle = {Design, Automation and Test in Europe (DATE)},
year = {1998},
address = {Paris/France},
month = {feb},
keywords = {hwsw},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1998-date-niemann.pdf},
confidential = {n},
abstract = {Two main aspects in hardware/software codesign are hardware/software partitioning and co-synthesis. Most codesign approaches work only on one of these problems. In this paper, an approach coupling hardware/software partitioning and co-synthesis will be presented, working fully-automatic. The techniques have been integrated in the codesign tool COOL (COdesign toOL) supporting the complete design flow from system specification to board-level implementation for multi-processor and multi-ASIC target architectures for data-flow dominated applications.},
} Two main aspects in hardware/software codesign are hardware/software partitioning and co-synthesis. Most codesign approaches work only on one of these problems. In this paper, an approach coupling hardware/software partitioning and co-synthesis will be presented, working fully-automatic. The techniques have been integrated in the codesign tool COOL (COdesign toOL) supporting the complete design flow from system specification to board-level implementation for multi-processor and multi-ASIC target architectures for data-flow dominated applications.
|
| Rainer Leupers and Peter Marwedel. Formale Methoden in der Codeerzeugung für digitale Signalprozessoren. In 5. GI/ITG/GMM Workshop "Methoden des Entwurfs und der Verifikation digitaler Systeme" Linz/Austria, April 1997 [BibTeX][PDF][Abstract]@inproceedings { eupers:1997:itg,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Formale Methoden in der Codeerzeugung f\"ur digitale Signalprozessoren},
booktitle = {5. GI/ITG/GMM Workshop "Methoden des Entwurfs und der Verifikation digitaler Systeme"},
year = {1997},
address = {Linz/Austria},
month = {apr},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-gi-itg.pdf},
confidential = {n},
abstract = {Der Bereich HW/SW-Codesign f\"ur eingebettete Systeme umfa\"st neben Methoden zur HW/SW-Partitionierung und Hardwaresynthese notwendigerweise auch Techniken zur Codeerzeugung f\"ur eingebettete programmierbare Prozessoren. Speziell im Falle von digitalen Signalprozessoren (DSPs) ist die Qualit\"at verf\"ugbarer Compiler unzureichend. Zur Vermeidung von aufwendiger Programmierung auf Assemblerebene sind daher neue DSP-spezifische Codeerzeugungstechiken notwendig. Dieser Beitrag stellt den Compiler RECORD vor, welcher f\"ur eine Klasse von DSPs Hochsprachenprogramme in Maschinencode \"ubersetzt. Um den speziellen Anforderungen an Compiler f\"ur DSPs gerecht zu werden, werden teilweise formale Methoden eingesetzt. Wir stellen zwei solche f\"ur RECORD entwickelte Methoden vor, welche zur Analyse von Prozessormodellen sowie zur Code-Kompaktierung verwendet werden, und diskutieren deren praktische Anwendung.},
} Der Bereich HW/SW-Codesign für eingebettete Systeme umfaßt neben Methoden zur HW/SW-Partitionierung und Hardwaresynthese notwendigerweise auch Techniken zur Codeerzeugung für eingebettete programmierbare Prozessoren. Speziell im Falle von digitalen Signalprozessoren (DSPs) ist die Qualität verfügbarer Compiler unzureichend. Zur Vermeidung von aufwendiger Programmierung auf Assemblerebene sind daher neue DSP-spezifische Codeerzeugungstechiken notwendig. Dieser Beitrag stellt den Compiler RECORD vor, welcher für eine Klasse von DSPs Hochsprachenprogramme in Maschinencode übersetzt. Um den speziellen Anforderungen an Compiler für DSPs gerecht zu werden, werden teilweise formale Methoden eingesetzt. Wir stellen zwei solche für RECORD entwickelte Methoden vor, welche zur Analyse von Prozessormodellen sowie zur Code-Kompaktierung verwendet werden, und diskutieren deren praktische Anwendung.
|
| Rainer Leupers and Peter Marwedel. Optimierende Compiler für DSPs: Was ist verfügbar ? (in German). In DSP Deutschland '97 Munich/Germany, October 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:dsp,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Optimierende Compiler f\"ur DSPs: Was ist verf\"ugbar ? (in German)},
booktitle = {DSP Deutschland '97},
year = {1997},
address = {Munich/Germany},
month = {oct},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-dsp-deutschland.pdf},
confidential = {n},
abstract = {Die Softwareentwicklung f\"ur eingebettete Prozessoren findet heute gr\"o\"stenteils noch auf Assemblerebene statt. Der Grund f\"ur diesen langfristig wohl unhaltbaren Zustand liegt in der mangelnden Verf\"ugbarkeit von guten C-Compilern. In den letzten Jahren wurden allerdings wesentliche Fortschritte in der Codeoptimierung - speziell f\"ur DSPs - erzielt, welche bisher nur unzureichend in kommerzielle Produkte umgesetzt wurden. Dieser Beitrag zeigt die prinzipiellen Optimierungsquellen auf und fa\"st den Stand der Technik zusammen. Die zentralen Methoden hierbei sind komplexe Optimierungsverfahren, welche \"uber die traditionelle Compilertechnologie hinausgehen, sowie die Ausnutzung der DSP-spezifischen Hardware-Architekturen zur effizienten \"Ubersetzung von C-Sprachkonstrukten in DSP-Maschinenbefehle. Die genannten Verfahren lassen sich teilweise auch allgemein auf (durch Compiler generierte oder handgeschriebene) Assemblerprogramme anwenden.},
} Die Softwareentwicklung für eingebettete Prozessoren findet heute größtenteils noch auf Assemblerebene statt. Der Grund für diesen langfristig wohl unhaltbaren Zustand liegt in der mangelnden Verfügbarkeit von guten C-Compilern. In den letzten Jahren wurden allerdings wesentliche Fortschritte in der Codeoptimierung - speziell für DSPs - erzielt, welche bisher nur unzureichend in kommerzielle Produkte umgesetzt wurden. Dieser Beitrag zeigt die prinzipiellen Optimierungsquellen auf und faßt den Stand der Technik zusammen. Die zentralen Methoden hierbei sind komplexe Optimierungsverfahren, welche über die traditionelle Compilertechnologie hinausgehen, sowie die Ausnutzung der DSP-spezifischen Hardware-Architekturen zur effizienten \"Ubersetzung von C-Sprachkonstrukten in DSP-Maschinenbefehle. Die genannten Verfahren lassen sich teilweise auch allgemein auf (durch Compiler generierte oder handgeschriebene) Assemblerprogramme anwenden.
|
| Rainer Leupers and Peter Marwedel. Retargetable Generation of Code Selectors from HDL Processor Models. In European Design & Test Conference (ED & TC) Paris/France, March 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:edtc,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Retargetable Generation of Code Selectors from HDL Processor Models},
booktitle = {European Design \& Test Conference (ED \& TC)},
year = {1997},
address = {Paris/France},
month = {mar},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-edtc.pdf},
confidential = {n},
abstract = {Besides high code quality,a primary issue in embedded code generation is retargetability of code generators. This paper presents techniques for automatic generation of code selectors from externally specified processor models. In contrast to previous work, our retargetable compiler RECORD does not require tool-specific modelling formalisms, but starts from general HDL processor models. From an HDL model, all processor aspects needed for code generation are automatically derived. As demonstrated by experimental results, short turnaround times for retargeting are achieved, which permits to study the HW/SW trade-off between processor architectures and program execution speed.},
} Besides high code quality,a primary issue in embedded code generation is retargetability of code generators. This paper presents techniques for automatic generation of code selectors from externally specified processor models. In contrast to previous work, our retargetable compiler RECORD does not require tool-specific modelling formalisms, but starts from general HDL processor models. From an HDL model, all processor aspects needed for code generation are automatically derived. As demonstrated by experimental results, short turnaround times for retargeting are achieved, which permits to study the HW/SW trade-off between processor architectures and program execution speed.
|
| Peter Marwedel. Code Generation for Core Processors. In Invited embedded tutorial, 34th Design Automation Conference 1997 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1997:dac,
author = {Marwedel, Peter},
title = {Code Generation for Core Processors},
booktitle = {Invited embedded tutorial, 34th Design Automation Conference},
year = {1997},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-f_dac.pdf},
confidential = {n},
abstract = {This tutorial responds to the rapidly increasing use of cores in general and of processor cores in particular for implementing systems-on-a-chip. In the first part of this text, we will provide a brief introduction to various cores. Applications can be found in most segments of the embedded systems market. These applications demand for extreme efficiency, and in particular for efficient processor architectures and for efficient embedded software. In the second part of this text, we will show that current compilers do not provide the required efficiency and we will give an overview over new compiler optimization techniques, which aim at making assembly language programming for embedded software obsolete. These new techniques take advantage of the special characteristics of embedded software and embedded architectures. Due to efficiency considerations, processor architectures optimized for application domains or even for particular applications are of interest. This results in a large number of architectures and instruction sets, leading to the requirement for retargeting compilers to those numerous architectures. In the final section of the tutorial, we will present techniques for retargeting compilers to new architectures easily. We will show, how compilers can be generated from descriptions of processors. One of the approaches closes the gap which so far existed between electronic CAD and compiler generation.},
} This tutorial responds to the rapidly increasing use of cores in general and of processor cores in particular for implementing systems-on-a-chip. In the first part of this text, we will provide a brief introduction to various cores. Applications can be found in most segments of the embedded systems market. These applications demand for extreme efficiency, and in particular for efficient processor architectures and for efficient embedded software. In the second part of this text, we will show that current compilers do not provide the required efficiency and we will give an overview over new compiler optimization techniques, which aim at making assembly language programming for embedded software obsolete. These new techniques take advantage of the special characteristics of embedded software and embedded architectures. Due to efficiency considerations, processor architectures optimized for application domains or even for particular applications are of interest. This results in a large number of architectures and instruction sets, leading to the requirement for retargeting compilers to those numerous architectures. In the final section of the tutorial, we will present techniques for retargeting compilers to new architectures easily. We will show, how compilers can be generated from descriptions of processors. One of the approaches closes the gap which so far existed between electronic CAD and compiler generation.
|
| Peter Marwedel. Processor-Core Based Design and Test - Invited Embedded Tutorial. In Asia and South Pacific Design Automation Conference 1998 Automation Tokyo, Japan, 1997 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1997:tut,
author = {Marwedel, Peter},
title = {Processor-Core Based Design and Test - Invited Embedded Tutorial},
booktitle = {Asia and South Pacific Design Automation Conference 1998 Automation},
year = {1997},
address = {Tokyo, Japan},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-aspdactut.pdf},
confidential = {n},
abstract = {This tutorial responds to the rapidly increasing use of various cores for implementing systems-on-a-chip. It specifically focusses on processor cores. We will give some examples of cores, including DSP cores and application-specific instruction-set processors (ASIPs). We will mention market trends for these components, and we will touch design procedures, in particular the use compilers. Finally, we will discuss the problem of testing core-based designs. Existing solutions include boundary scan, embedded in-circuit emulation (ICE), the use of processor resources for stimuli/response compaction and self-test programs.},
} This tutorial responds to the rapidly increasing use of various cores for implementing systems-on-a-chip. It specifically focusses on processor cores. We will give some examples of cores, including DSP cores and application-specific instruction-set processors (ASIPs). We will mention market trends for these components, and we will touch design procedures, in particular the use compilers. Finally, we will discuss the problem of testing core-based designs. Existing solutions include boundary scan, embedded in-circuit emulation (ICE), the use of processor resources for stimuli/response compaction and self-test programs.
|
| Birger Landwehr, Peter Marwedel, Ingolf Markhof and Rainer Dömer. Exploiting Isomorphism for Speeding-Up Instance-Binding in an Integrated Scheduling, Allocation and Assignment Approach to Architectural Synthesis. In Conference on Computer Hardware Description Languages and their Applications Toledo, Spain, April 1997 [BibTeX][PDF][Abstract]@inproceedings { landwehr:1997:chdl,
author = {Landwehr, Birger and Marwedel, Peter and Markhof, Ingolf and D\"omer, Rainer},
title = {Exploiting Isomorphism for Speeding-Up Instance-Binding in an Integrated Scheduling, Allocation and Assignment Approach to Architectural Synthesis},
booktitle = {Conference on Computer Hardware Description Languages and their Applications},
year = {1997},
address = {Toledo, Spain},
month = {apr},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-chdl.pdf},
confidential = {n},
abstract = {Register-Transfer (RT-) level netlists are said to be isomorphic if they can be made identical by relabeling RT-components. RT-netlists can be generated by architectural synthesis. In order to consider just the essential design decisions, architectural synthesis should consider only a single representative of sets of isomorphic netlists. In this paper, we are using netlist isomorphism for the very first time in architectural synthesis. Furthermore, we describe how an integer-programming (IP-) based synthesis technique can be extended to take advantage of netlist isomorphism.},
} Register-Transfer (RT-) level netlists are said to be isomorphic if they can be made identical by relabeling RT-components. RT-netlists can be generated by architectural synthesis. In order to consider just the essential design decisions, architectural synthesis should consider only a single representative of sets of isomorphic netlists. In this paper, we are using netlist isomorphism for the very first time in architectural synthesis. Furthermore, we describe how an integer-programming (IP-) based synthesis technique can be extended to take advantage of netlist isomorphism.
|
| Rainer Leupers and Peter Marwedel. Retargetable Compilers for Embedded DSPs. In 7th European Multimedia, Microprocessor Systems and Electronic Commerce Conference (EMMSEC) Florence/Italy, November 1997 [BibTeX][PDF][Abstract]@inproceedings { leupers:1997:emmsec,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Retargetable Compilers for Embedded DSPs},
booktitle = {7th European Multimedia, Microprocessor Systems and Electronic Commerce Conference (EMMSEC)},
year = {1997},
address = {Florence/Italy},
month = {nov},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-emmsec.pdf},
confidential = {n},
abstract = {Programmable devices are a key technology for the design of embedded systems, such as in the consumer electronics market. Processor cores are used as building blocks for more and more embedded system designs, since they provide a unique combination of features: flexibility and reusability. Processor-based design implies that compilers capable of generating efficient machine code are necessary. However, highly efficient compilers for embedded processors are hardly available. In particular, this holds for digital signal processors (DSPs). This contribution is intended to outline different aspects of DSP compiler technology. First, we cover demands on compilers for embedded DSPs, which are partially in sharp contrast to traditional compiler construction. Secondly, we present recent advances in DSP code optimization techniques, which explore a comparatively large search space in order to achieve high code quality. Finally, we discuss the different approaches to retargetability of compilers, that is, techniques for automatic generation of compilers from processor models.},
} Programmable devices are a key technology for the design of embedded systems, such as in the consumer electronics market. Processor cores are used as building blocks for more and more embedded system designs, since they provide a unique combination of features: flexibility and reusability. Processor-based design implies that compilers capable of generating efficient machine code are necessary. However, highly efficient compilers for embedded processors are hardly available. In particular, this holds for digital signal processors (DSPs). This contribution is intended to outline different aspects of DSP compiler technology. First, we cover demands on compilers for embedded DSPs, which are partially in sharp contrast to traditional compiler construction. Secondly, we present recent advances in DSP code optimization techniques, which explore a comparatively large search space in order to achieve high code quality. Finally, we discuss the different approaches to retargetability of compilers, that is, techniques for automatic generation of compilers from processor models.
|
| Peter Marwedel. Compilers for Embedded Processors. In Invited Embedded Tutorial SASIMI, Osaka, 1997 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1997:sasimi,
author = {Marwedel, Peter},
title = {Compilers for Embedded Processors},
booktitle = {Invited Embedded Tutorial},
year = {1997},
address = {SASIMI, Osaka},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1997-sasimi.pdf},
confidential = {n},
abstract = {This talk responds to the rapidly increasing use of embedded processors for implementing systems. Such processors come in the form of discrete processors as well as in the form of core processors. They are available both from vendors and within system companies. Applications can be found in most segments of the embedded system market, such as automotive electronics and telecommunications. These applications demand for extremely efficient processor architectures, optimized for a certain application domain or even a certain application. Current compiler technology supports these architectures very poorly and has recently been recognized as a major bottleneck for designing systems quickly, efficiently and reliably. A number of recent research projects aim at removing this bottleneck. The talk will briefly discuss the trend towards embedded processors. We will show market trends and examples of recent embedded processors. We will also introduce the terms "application specific instruction-set processors" (ASIPs), "application-specific signal processors" (ASSPs), "soft cores" and "hard cores". We will then present new code optimization approaches taking the special characterstics of embedded processor architectures into account. In particular, we will present new memory allocation and code compaction algorithms. In the final section of the talk, we will present techniques for retargeting compilers to new architectures easily. These techniques are motivated by the need for domain- or application-dependent optimizations of processor architectures. The scope for such optimizations should not be restricted to hardware architectures but has to include the corresponding work on compilers as well. We will show, how compilers can be generated from descriptions of processor architectures. Presented techniques aim at bridging the gap between electronic CAD and compiler generation. },
} This talk responds to the rapidly increasing use of embedded processors for implementing systems. Such processors come in the form of discrete processors as well as in the form of core processors. They are available both from vendors and within system companies. Applications can be found in most segments of the embedded system market, such as automotive electronics and telecommunications. These applications demand for extremely efficient processor architectures, optimized for a certain application domain or even a certain application. Current compiler technology supports these architectures very poorly and has recently been recognized as a major bottleneck for designing systems quickly, efficiently and reliably. A number of recent research projects aim at removing this bottleneck. The talk will briefly discuss the trend towards embedded processors. We will show market trends and examples of recent embedded processors. We will also introduce the terms "application specific instruction-set processors" (ASIPs), "application-specific signal processors" (ASSPs), "soft cores" and "hard cores". We will then present new code optimization approaches taking the special characterstics of embedded processor architectures into account. In particular, we will present new memory allocation and code compaction algorithms. In the final section of the talk, we will present techniques for retargeting compilers to new architectures easily. These techniques are motivated by the need for domain- or application-dependent optimizations of processor architectures. The scope for such optimizations should not be restricted to hardware architectures but has to include the corresponding work on compilers as well. We will show, how compilers can be generated from descriptions of processor architectures. Presented techniques aim at bridging the gap between electronic CAD and compiler generation. |
| Rainer Leupers and Peter Marwedel. Instruction-Set Modelling for ASIP Code Generation. In 9th Int. Conference on VLSI Design Bangalore/India, January 1996 [BibTeX][PDF][Abstract]@inproceedings { leupers:1996:vlsi,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Instruction-Set Modelling for ASIP Code Generation},
booktitle = {9th Int. Conference on VLSI Design},
year = {1996},
address = {Bangalore/India},
month = {jan},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-vlsi-design.pdf},
confidential = {n},
abstract = {A main objective in code generation for ASIPs is to develop retargetable compilers in order to permit exploration of different architectural alternatives within short turnaround time. Retargetability requires that the compiler is supplied with a formal description of the target processor. This description is usually transformed into an internal instruction set model, on which the actual code generation operates. In this contribution we analyze the demands on instruction set models for retargetable code generation, and we present a formal instruction set model which meets these demands. Compared to previous work, it covers a broad range of instruction formats and includes a detailed view of inter-instruction restrictions.},
} A main objective in code generation for ASIPs is to develop retargetable compilers in order to permit exploration of different architectural alternatives within short turnaround time. Retargetability requires that the compiler is supplied with a formal description of the target processor. This description is usually transformed into an internal instruction set model, on which the actual code generation operates. In this contribution we analyze the demands on instruction set models for retargetable code generation, and we present a formal instruction set model which meets these demands. Compared to previous work, it covers a broad range of instruction formats and includes a detailed view of inter-instruction restrictions.
|
| Ralf Niemann and Peter Marwedel. Hardware/Software Partitioning using Integer Programming. In Proc. European Design & Test Conference 1996 [BibTeX][PDF][Abstract]@inproceedings { niemann:1996:edtc,
author = {Niemann, Ralf and Marwedel, Peter},
title = {Hardware/Software Partitioning using Integer Programming},
booktitle = {Proc. European Design \\& Test Conference},
year = {1996},
keywords = {hwsw},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-edtc.pdf},
confidential = {n},
abstract = {One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated respective to the chosen objective function. The partitioning approach works fully automatic and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating the cost metrics is compensated by an improved quality of the estimations compared to the results of estimators. Therefore, fewer iteration steps of partitioning will be needed. The paper will show that using integer programming to solve the hardware/software partitioning problem is feasible and leads to promising results.},
} One of the key problems in hardware/software codesign is hardware/software partitioning. This paper describes a new approach to hardware/software partitioning using integer programming (IP). The advantage of using IP is that optimal results are calculated respective to the chosen objective function. The partitioning approach works fully automatic and supports multi-processor systems, interfacing and hardware sharing. In contrast to other approaches where special estimators are used, we use compilation and synthesis tools for cost estimation. The increased time for calculating the cost metrics is compensated by an improved quality of the estimations compared to the results of estimators. Therefore, fewer iteration steps of partitioning will be needed. The paper will show that using integer programming to solve the hardware/software partitioning problem is feasible and leads to promising results.
|
| Rainer Leupers and Peter Marwedel. Algorithms for Address Assignment in DSP Code Generation. In Int. Conf. on Computer-Aided Design (ICCAD) San Jose, November 1996 [BibTeX][PDF][Abstract]@inproceedings { leupers:1996:iccad,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Algorithms for Address Assignment in DSP Code Generation},
booktitle = {Int. Conf. on Computer-Aided Design (ICCAD)},
year = {1996},
address = {San Jose},
month = {nov},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-iccad.pdf},
confidential = {n},
abstract = {This paper presents DSP code optimization techniques, which originate from dedicated memory address generation hardware. We define a generic model of DSP address generation units. Based on this model, we present efficient heuristics for computing memory layouts for program variables, which optimize utilization of parallel address generation units. Improvements and generalizations of previous work are described, and the efficacy of the proposed algorithms is demonstrated through experimental evaluation.},
} This paper presents DSP code optimization techniques, which originate from dedicated memory address generation hardware. We define a generic model of DSP address generation units. Based on this model, we present efficient heuristics for computing memory layouts for program variables, which optimize utilization of parallel address generation units. Improvements and generalizations of previous work are described, and the efficacy of the proposed algorithms is demonstrated through experimental evaluation.
|
| Rainer Leupers and Peter Marwedel. Instruction Selection for Embedded DSPs with Complex Instructions. In European Design Automation Conference (EURO-DAC) Geneva/Switzerland, September 1996 [BibTeX][PDF][Abstract]@inproceedings { leupers:1996:eurodac,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Instruction Selection for Embedded DSPs with Complex Instructions},
booktitle = {European Design Automation Conference (EURO-DAC)},
year = {1996},
address = {Geneva/Switzerland},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1996-eurodac.pdf},
confidential = {n},
abstract = {We address the problem of instruction selection in code generation for embedded digital signal processors. Recent work has shown that this task can be efficiently solved by tree covering with dynamic programming, even in combination with the task of register allocation. However, performing instruction selection by tree covering only does not exploit available instruction-level parallelism, for instance in form of multiply-accumulate instructions or parallel data moves. In this paper we investigate how such complex instructions may affect detection of optimal tree covers, and we present a two-phase scheme for instruction selection which exploits available instruction-level parallelism. At the expense of higher compilation time, this technique may significantly increase the code quality compared to previous work, which is demonstrated for a widespread DSP.},
} We address the problem of instruction selection in code generation for embedded digital signal processors. Recent work has shown that this task can be efficiently solved by tree covering with dynamic programming, even in combination with the task of register allocation. However, performing instruction selection by tree covering only does not exploit available instruction-level parallelism, for instance in form of multiply-accumulate instructions or parallel data moves. In this paper we investigate how such complex instructions may affect detection of optimal tree covers, and we present a two-phase scheme for instruction selection which exploits available instruction-level parallelism. At the expense of higher compilation time, this technique may significantly increase the code quality compared to previous work, which is demonstrated for a widespread DSP.
|
| Rainer Leupers and Peter Marwedel. A BDD-based Frontend for Retargetable Compilers. In Proc. European Design & Test Conference 1995 [BibTeX][PDF][Abstract]@inproceedings { leupers:1995:edtc,
author = {Leupers, Rainer and Marwedel, Peter},
title = {A BDD-based Frontend for Retargetable Compilers},
booktitle = {Proc. European Design \& Test Conference},
year = {1995},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-edtc.pdf},
confidential = {n},
abstract = {In this paper we present a unified frontend for retargetable compilers that performs analysis of the target processor model. Our approach bridges the gap between structural and behavioral processor models for retargetable compilation. This is achieved by means of instruction set extraction. The extraction technique is based on a BDD data structure which significantly improves control signal analysis in the target processor compared to previous approaches.},
} In this paper we present a unified frontend for retargetable compilers that performs analysis of the target processor model. Our approach bridges the gap between structural and behavioral processor models for retargetable compilation. This is achieved by means of instruction set extraction. The extraction technique is based on a BDD data structure which significantly improves control signal analysis in the target processor compared to previous approaches.
|
| Rainer Leupers and Peter Marwedel. Time-constrained Code Compaction for DSPs. In Int. Symp. on System Synthesis (ISSS) September 1995 [BibTeX][PDF][Abstract]@inproceedings { leupers:1995:isss,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Time-constrained Code Compaction for DSPs},
booktitle = {Int. Symp. on System Synthesis (ISSS)},
year = {1995},
month = {sep},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-isss.pdf},
confidential = {n},
abstract = {DSP algorithms in most cases are subject to hard real-time constraints. In case of programmable DSP processors, meeting those constraints must be ensured by appropriate code generation techniques. For processors offering instruction-level parallelism, the task of code generation includes code compaction. The exact timing behavior of a DSP program is only known after compaction. Therefore, real-time constraints should be taken into account during the compaction phase. While most known DSP code generators rely on rigid heuristics for that phase, this paper proposes a novel approach to local code compaction based on an Integer Programming model, which obeys exact timing constraints. Due to a general problem formulation, the model also obeys encoding restrictions and possible side effects.},
} DSP algorithms in most cases are subject to hard real-time constraints. In case of programmable DSP processors, meeting those constraints must be ensured by appropriate code generation techniques. For processors offering instruction-level parallelism, the task of code generation includes code compaction. The exact timing behavior of a DSP program is only known after compaction. Therefore, real-time constraints should be taken into account during the compaction phase. While most known DSP code generators rely on rigid heuristics for that phase, this paper proposes a novel approach to local code compaction based on an Integer Programming model, which obeys exact timing constraints. Due to a general problem formulation, the model also obeys encoding restrictions and possible side effects.
|
| Rainer Leupers and Peter Marwedel. Using Compilers for Heterogeneous System Design. In Int. Conf. on Parallel Architectures and Compilation Techniques (PACT) June 1995 [BibTeX][PDF][Abstract]@inproceedings { leupers:1995:pact,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Using Compilers for Heterogeneous System Design},
booktitle = {Int. Conf. on Parallel Architectures and Compilation Techniques (PACT)},
year = {1995},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-pact.pdf},
confidential = {n},
abstract = {Heterogeneous systems combine both data and control processing functions. A programmable DSP core forms the central component. The design of such systems establishes a new application of compilers in electronic CAD: In order to meet given real-time constraints and optimize chip area consumption, the DSP core needs to be customized for each application. In turn, this requires compiler support for evaluating different architectural alternatives. This paper discusses the importance of retargetable compilers in heterogeneous system design.},
} Heterogeneous systems combine both data and control processing functions. A programmable DSP core forms the central component. The design of such systems establishes a new application of compilers in electronic CAD: In order to meet given real-time constraints and optimize chip area consumption, the DSP core needs to be customized for each application. In turn, this requires compiler support for evaluating different architectural alternatives. This paper discusses the importance of retargetable compilers in heterogeneous system design.
|
| Ulrich Bieker and Peter Marwedel. Retargetable self-test program generation using constraint logic programming. In Proceedings of the 32nd annual ACM/IEEE Design Automation Conference, pages 605--611 New York, NY, USA, 1995 [BibTeX][PDF][Link]@inproceedings { Bieker:1995:RSP:217474.217597,
author = {Bieker, Ulrich and Marwedel, Peter},
title = {Retargetable self-test program generation using constraint logic programming},
booktitle = {Proceedings of the 32nd annual ACM/IEEE Design Automation Conference},
year = {1995},
series = {DAC '95},
pages = {605--611},
address = {New York, NY, USA},
publisher = {ACM},
url = {http://doi.acm.org/10.1145/217474.217597},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1995-dac.pdf},
confidential = {n},
} |
| Rainer Leupers, Wolfgang Schenk and Peter Marwedel. Retargetable Assembly Code Generation by Bootstrapping. In Proc. 7th International Symposium on High-Level Synthesis 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:hlss,
author = {Leupers, Rainer and Schenk, Wolfgang and Marwedel, Peter},
title = {Retargetable Assembly Code Generation by Bootstrapping},
booktitle = {Proc. 7th International Symposium on High-Level Synthesis},
year = {1994},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-hlss.pdf},
confidential = {n},
abstract = {In a hardware/software codesign environment compilers are needed that map software components of a partitioned system behavioral description onto a programmable processor. Since the processor structure is not static, but can repeatedly change during the design process, the compiler should be retargetable in order to avoid manual compiler adaption for each alternative architecture. A restriction of existing retargetable compilers is that they only generate microcode for the target architecture instead of machine-level code. In this paper we introduce a bootstrapping technique permitting to translate high-level language (HLL) programs into real machine-level code using a retargetable microcode compiler. Retargetability is preserved, permitting to compare different architectural alternatives in a codesign framework within relatively short time.},
} In a hardware/software codesign environment compilers are needed that map software components of a partitioned system behavioral description onto a programmable processor. Since the processor structure is not static, but can repeatedly change during the design process, the compiler should be retargetable in order to avoid manual compiler adaption for each alternative architecture. A restriction of existing retargetable compilers is that they only generate microcode for the target architecture instead of machine-level code. In this paper we introduce a bootstrapping technique permitting to translate high-level language (HLL) programs into real machine-level code using a retargetable microcode compiler. Retargetability is preserved, permitting to compare different architectural alternatives in a codesign framework within relatively short time.
|
| Rainer Leupers, Ralf Niemann and Peter Marwedel. Methods for Retargetable DSP Code Generation. In IEEE Workshop on VLSI Signal Processing 1994 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:ieee,
author = {Leupers, Rainer and Niemann, Ralf and Marwedel, Peter},
title = {Methods for Retargetable DSP Code Generation},
booktitle = {IEEE Workshop on VLSI Signal Processing 1994},
year = {1994},
keywords = {hwsw},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-IEEE.pdf},
confidential = {n},
abstract = {Efficient embedded DSP system design requires methods of hardware/software codesign. In this contribution we focus on software synthesis for partitioned system behavioral descriptions. In previous approaches, this task is performed by compiling the behavioral descriptions onto standard processors using target-specific compilers. It is argued that abandoning this restriction allows for higher degrees of freedom in design space exploration. In turn, this demands for retargetable code generation tools. We present different schemes for DSP code generation using the MSSQ microcode generator. Experiments with industrial applications revealed that retargetable DSP code generation based on structural hardware descriptions is feasible, but there exists a strong dependency between the behavioral description style and the resulting code quality. As a result, necessary features of high-quality retargetable DSP code generators are identified.},
} Efficient embedded DSP system design requires methods of hardware/software codesign. In this contribution we focus on software synthesis for partitioned system behavioral descriptions. In previous approaches, this task is performed by compiling the behavioral descriptions onto standard processors using target-specific compilers. It is argued that abandoning this restriction allows for higher degrees of freedom in design space exploration. In turn, this demands for retargetable code generation tools. We present different schemes for DSP code generation using the MSSQ microcode generator. Experiments with industrial applications revealed that retargetable DSP code generation based on structural hardware descriptions is feasible, but there exists a strong dependency between the behavioral description style and the resulting code quality. As a result, necessary features of high-quality retargetable DSP code generators are identified.
|
| Wolfgang Schenk Rainer Leupers and Peter Marwedel. Microcode Generation for Flexible Parallel Target Architectures. In IFIP Trans.\ A-50: Parallel Architectures and Compilation Techniques (PACT-94) 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:pact,
author = {Rainer Leupers, Wolfgang Schenk and Marwedel, Peter},
title = {Microcode Generation for Flexible Parallel Target Architectures},
booktitle = {IFIP Trans.\ A-50: Parallel Architectures and Compilation Techniques (PACT-94)},
year = {1994},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-pact.pdf},
confidential = {n},
abstract = {Advanced architectural features of microprocessors like instruction level parallelism and pipelined functional hardware units require code generation techniques beyond the scope of traditional compilers. Additionally, recent design styles in the area of digital signal processing pose a strong demand for retargetable compilation. This paper presents an approach to code generation based on netlist descriptions of the target processor. The basic features of the MSSQ microcode compiler are outlined, and novel techniques for handling complex hardware modules and multi-cycle operations are presented.},
} Advanced architectural features of microprocessors like instruction level parallelism and pipelined functional hardware units require code generation techniques beyond the scope of traditional compilers. Additionally, recent design styles in the area of digital signal processing pose a strong demand for retargetable compilation. This paper presents an approach to code generation based on netlist descriptions of the target processor. The basic features of the MSSQ microcode compiler are outlined, and novel techniques for handling complex hardware modules and multi-cycle operations are presented.
|
| Rainer Leupers and Peter Marwedel. Instruction Set Extraction From Programmable Structures.. In Proc. EURO-DAC 1994 1994 [BibTeX][PDF][Abstract]@inproceedings { leupers:1994:eurodac,
author = {Leupers, Rainer and Marwedel, Peter},
title = {Instruction Set Extraction From Programmable Structures.},
booktitle = {Proc. EURO-DAC 1994},
year = {1994},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1994-eurodac-extract.pdf},
confidential = {n},
abstract = {Due to the demand for more design flexibility and design reuse, ASIPs have emerged as a new important design style in the area of DSP systems. In order to obtain efficient hardware/software partitionings within ASIP-based systems, the designer has to be supported by CAD tools that allow frequent re-mapping of algorithms onto variable programmable target structures. This leads to a new class of design tools: retargetable compilers. Considering existing retargetable compilers based on pattern matching, automatic instruction set extraction is identified as a profitable frontend for those compilers. This paper presents concepts and an implementation of an instruction set extractor.},
} Due to the demand for more design flexibility and design reuse, ASIPs have emerged as a new important design style in the area of DSP systems. In order to obtain efficient hardware/software partitionings within ASIP-based systems, the designer has to be supported by CAD tools that allow frequent re-mapping of algorithms onto variable programmable target structures. This leads to a new class of design tools: retargetable compilers. Considering existing retargetable compilers based on pattern matching, automatic instruction set extraction is identified as a profitable frontend for those compilers. This paper presents concepts and an implementation of an instruction set extractor.
|
| Birger Landwehr and Peter Marwedel. Intelligent library component selection and management in an IP-model based high-level synthesis system. In IFIP Workshop on Logic and Architecture Synthesis Grenoble, 1993 [BibTeX]@inproceedings { landwehr:93:ifip,
author = {Landwehr, Birger and Marwedel, Peter},
title = {Intelligent library component selection and management in an IP-model based high-level synthesis system},
booktitle = {IFIP Workshop on Logic and Architecture Synthesis},
year = {1993},
address = {Grenoble},
confidential = {n},
} |
| Peter Marwedel. Tree-Based Mapping of Algorithms to Predefined Structures. In International Conference on Computer Aided Design (ICCAD) 1993 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1993:iccad,
author = {Marwedel, Peter},
title = {Tree-Based Mapping of Algorithms to Predefined Structures},
booktitle = {International Conference on Computer Aided Design (ICCAD)},
year = {1993},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-iccad.pdf},
confidential = {n},
abstract = {Due to the need for fast design cycles and low production cost, programmable targets like DSP processors are becoming increasingly popular. Design planning, detailed design as well as updating such designs requires mapping existing algorithms onto these targets. Instead of writing target-specific mappers, we propose using retargetable mappers. The technique reported in this paper is based on pattern matching. Binary code is generated as a result of this matching process. This paper describes the techniques of our mapper MSSV and identifies areas for improvements. As a result, it shows that efficient handling of alternative mappings is crucial for an acceptable performance.},
} Due to the need for fast design cycles and low production cost, programmable targets like DSP processors are becoming increasingly popular. Design planning, detailed design as well as updating such designs requires mapping existing algorithms onto these targets. Instead of writing target-specific mappers, we propose using retargetable mappers. The technique reported in this paper is based on pattern matching. Binary code is generated as a result of this matching process. This paper describes the techniques of our mapper MSSV and identifies areas for improvements. As a result, it shows that efficient handling of alternative mappings is crucial for an acceptable performance.
|
| C. Albrecht, S. Bashford, P. Marwedel, A. Neumann and W. Schenk. The design of the PRIPS microprocessor. In 4th EUROCHIP-Workshop on VLSI Training 1993 [BibTeX][PDF][Abstract]@inproceedings { albrecht:1993:eurochip,
author = {Albrecht, C. and Bashford, S. and Marwedel, P. and Neumann, A. and Schenk, W.},
title = {The design of the PRIPS microprocessor},
booktitle = {4th EUROCHIP-Workshop on VLSI Training},
year = {1993},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1993-eurochips-albrecht.pdf},
confidential = {n},
abstract = {The PRIPS microprocessor was recently designed at the University of Dortmund, Lehrstuhl
Informatik XII. PRIPS is a coprocessor with an RISC-like instruction set. The supported
data types and some of the instructions are oriented towards supporting the execution of
PROLOG programs. The design was performed by a project group consisting of 10 stu-
dents. Such project groups are one of the key features of the computer science curriculum at
Dortmund. The project group was partitioned into three subgroups. The first subgroup was
responsible for everything related to compiling PROLOG programs into machine code. This
group designed the instruction set of PRIPS. the first approach was to consider implementing
the Warren abstract machine (WAM). This approach was rejected because of the size of the
required microcode. Therefore, it was decided, to implement a RISC-like instruction set with
some special instructions for PROLOG. With this approach, PROLOG-programs are first
compiled into WAM-code. WAM instructions are then expanded into RISC instructions. The
first subgroup also analysed the effect of different options for caches on the performance.
The second subgroup designed the register-transfer structure for the given instruction set. To
this end, semantics of the instruction set was described in the hardware description language
MIMOLA. Using the TODOS high-level synthesis system designed at the Universities of
Kiel and Dortmund, an initial RT-structure was generated. Subsequent improvements were
added by using the retargetable code generator MSSQ (see paper by Marwedel at EDAC-
EUROASIC-93 for a description of the design process). The final register transfer structure
contains a register file of 32 registers of 32 bits and an ALU with very sophisticated concurrent
tag checking modes. In order to achieve maximum flexibility, it was decided to implement an
on-chip loadable microstore. In order to improve testability of the chip, PRIPS uses external
clock generation and a scan-path design for the controller.
The third group entered the PRIPS design into a commercial Cadence EDGE CAD database.
Due to the problems with the EDIF interface, the design was entered using schematics entry.
Final layout was also obtained with EDGE. Several iterations were required to meet design
constraints. The final chip size is 12 by 8 mm for the ES2 1.5μm CMOS process.
PRIPS has been submitted to EUROCHIP for fabrication. After some delay, caused by
undocumented features of format converters, 30 samples were received in February, 1993.
The setup used for testing basically consists of an Hewlett Packard 16500A tester, which is
linked to a Sun workstation and programmed using the TSSI software package. First results
indicate that some of the chips are working. However, detailed results are not available yet.},
} The PRIPS microprocessor was recently designed at the University of Dortmund, Lehrstuhl
Informatik XII. PRIPS is a coprocessor with an RISC-like instruction set. The supported
data types and some of the instructions are oriented towards supporting the execution of
PROLOG programs. The design was performed by a project group consisting of 10 stu-
dents. Such project groups are one of the key features of the computer science curriculum at
Dortmund. The project group was partitioned into three subgroups. The first subgroup was
responsible for everything related to compiling PROLOG programs into machine code. This
group designed the instruction set of PRIPS. the first approach was to consider implementing
the Warren abstract machine (WAM). This approach was rejected because of the size of the
required microcode. Therefore, it was decided, to implement a RISC-like instruction set with
some special instructions for PROLOG. With this approach, PROLOG-programs are first
compiled into WAM-code. WAM instructions are then expanded into RISC instructions. The
first subgroup also analysed the effect of different options for caches on the performance.
The second subgroup designed the register-transfer structure for the given instruction set. To
this end, semantics of the instruction set was described in the hardware description language
MIMOLA. Using the TODOS high-level synthesis system designed at the Universities of
Kiel and Dortmund, an initial RT-structure was generated. Subsequent improvements were
added by using the retargetable code generator MSSQ (see paper by Marwedel at EDAC-
EUROASIC-93 for a description of the design process). The final register transfer structure
contains a register file of 32 registers of 32 bits and an ALU with very sophisticated concurrent
tag checking modes. In order to achieve maximum flexibility, it was decided to implement an
on-chip loadable microstore. In order to improve testability of the chip, PRIPS uses external
clock generation and a scan-path design for the controller.
The third group entered the PRIPS design into a commercial Cadence EDGE CAD database.
Due to the problems with the EDIF interface, the design was entered using schematics entry.
Final layout was also obtained with EDGE. Several iterations were required to meet design
constraints. The final chip size is 12 by 8 mm for the ES2 1.5μm CMOS process.
PRIPS has been submitted to EUROCHIP for fabrication. After some delay, caused by
undocumented features of format converters, 30 samples were received in February, 1993.
The setup used for testing basically consists of an Hewlett Packard 16500A tester, which is
linked to a Sun workstation and programmed using the TSSI software package. First results
indicate that some of the chips are working. However, detailed results are not available yet.
|
| Peter Marwedel. Matching System and Component Behaviour in MIMOLA Synthesis Tools.. In Proc. European Design Automation Conference (EDAC) 1990 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1990:edac,
author = {Marwedel, Peter},
title = {Matching System and Component Behaviour in MIMOLA Synthesis Tools.},
booktitle = {Proc. European Design Automation Conference (EDAC)},
year = {1990},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1990-edac.pdf},
confidential = {n},
abstract = {This paper discusses the selection of available components during high-level synthesis. We stress the importance of describing the behaviour of available components in some language which is readable for the designer. This behaviour is internally represented by implications. This concept is the key for formal reasoning about the component's capabilities. Alternative functions and sequential and concurrent cooperation of components can be easily described.},
} This paper discusses the selection of available components during high-level synthesis. We stress the importance of describing the behaviour of available components in some language which is readable for the designer. This behaviour is internally represented by implications. This concept is the key for formal reasoning about the component's capabilities. Alternative functions and sequential and concurrent cooperation of components can be easily described.
|
| Lothar Nowak and Peter Marwedel. Verification of Hardware Descriptions by Retargetable Code Generation.. In Proceedings of the 26th Design Automation Conference (DAC) 1989 [BibTeX][PDF][Abstract]@inproceedings { nowak:1989:dac,
author = {Nowak, Lothar and Marwedel, Peter},
title = {Verification of Hardware Descriptions by Retargetable Code Generation.},
booktitle = {Proceedings of the 26th Design Automation Conference (DAC)},
year = {1989},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1989-dac.pdf},
confidential = {n},
abstract = {This paper proposes a new method for hardware verification. The basic idea is the application of a retargetable compiler as verification tool. A retargetable compiler is able to compile programs into the machine code of a specified hardware (target). If the program is the complete behavioural specification of the target, the compiler can be used to verify that a properly programmed structure implements the behaviour. Methods, algorithms and applications of an existing retargetable compiler are described.},
} This paper proposes a new method for hardware verification. The basic idea is the application of a retargetable compiler as verification tool. A retargetable compiler is able to compile programs into the machine code of a specified hardware (target). If the program is the complete behavioural specification of the target, the compiler can be used to verify that a properly programmed structure implements the behaviour. Methods, algorithms and applications of an existing retargetable compiler are described.
|
| M. Balakrishnan and Peter Marwedel. Integrated Scheduling and Binding: A Synthesis Approach for Design Space Exploration. In Proceedings of the 26th Design Automation Conference (DAC) 1989 [BibTeX][PDF][Abstract]@inproceedings { balakrishnan:1989:dac,
author = {Balakrishnan, M. and Marwedel, Peter},
title = {Integrated Scheduling and Binding: A Synthesis Approach for Design Space Exploration},
booktitle = {Proceedings of the 26th Design Automation Conference (DAC)},
year = {1989},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1989-dac.pdf},
confidential = {n},
abstract = {Synthesis of digital systems, involves a number of tasks ranging from scheduling to generating interconnections. The interrelationship between these tasks implies that good designs can only be generated by considering the overall impact of a design decision. The approach presented in this paper provides a framework for integrating scheduling decisions with binding decisions. The methodology supports allocation of a wider mix of operator modules and covers the design space more effectively. The process itself can be described as incremental synthesis and is thus wellsuited for applications involving partial pre-synthesized structures.},
} Synthesis of digital systems, involves a number of tasks ranging from scheduling to generating interconnections. The interrelationship between these tasks implies that good designs can only be generated by considering the overall impact of a design decision. The approach presented in this paper provides a framework for integrating scheduling decisions with binding decisions. The methodology supports allocation of a wider mix of operator modules and covers the design space more effectively. The process itself can be described as incremental synthesis and is thus wellsuited for applications involving partial pre-synthesized structures.
|
| Lothar Nowak and Peter Marwedel. A retargetable microcode-compiler and its application in design verification and architecture evaluation (in German: Ein retargierbarer Mikrocode-Compiler und seine Anwendung in Entwurfsverifikation und Architekturbewertung). In 18. GI Jahrestagung, pages 233-245 Hamburg, 1988, OCR errors are possible [BibTeX][PDF][Abstract]@inproceedings { nowak:1988:gi,
author = {Nowak, Lothar and Marwedel, Peter},
title = {A retargetable microcode-compiler and its application in design verification and architecture evaluation (in German: Ein retargierbarer Mikrocode-Compiler und seine Anwendung in Entwurfsverifikation und Architekturbewertung)},
booktitle = {18. GI Jahrestagung},
year = {1988},
pages = {233-245},
address = {Hamburg},
note = {OCR errors are possible},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1988-GI-Tagung.pdf},
confidential = {n},
abstract = {Die vorliegende Arbeit beschreibt einen retargierbaren Compiler. Ein solcher Compiler ist in der Lage f\"ur verschiedenen Zielmaschinen (Targets) Maschinencode zu erzeugen. Einsatzgebiete dieses Compilers sind vor allem: die Codeerzeugung f\"ur anwendungsspezifische Spezialprozessoren (speziell auch f\"ur Maschinen gro\"ser Befehlswortbreite), die Verifikation einer manuell erzeugten Hardwarestruktur gegen\"uber einer Verhaltensspezifikation und der Vergleich verschiedener Architekturkonzepte untereinander. Wesentliche Vorteile des Compilers ergeben sich aus der Tatsache, da\"s bereits dem Hardware-Designer eine Compiler-Unterst\"utzung zur Verf\"ugung steht. Schnittstellen-Probleme zwischen Compiler- und Hardware-Entwicklung entfallen, da f\"ur beide die gleiche Maschinenbeschreibung benutzt wird. So k\"onnen beim Architekturentwurf verschiedene Konzepte hinsichtlich Laufzeit und Codedichte l\"angerer Anwendungsprogramme verglichen werden.},
} Die vorliegende Arbeit beschreibt einen retargierbaren Compiler. Ein solcher Compiler ist in der Lage für verschiedenen Zielmaschinen (Targets) Maschinencode zu erzeugen. Einsatzgebiete dieses Compilers sind vor allem: die Codeerzeugung für anwendungsspezifische Spezialprozessoren (speziell auch für Maschinen großer Befehlswortbreite), die Verifikation einer manuell erzeugten Hardwarestruktur gegenüber einer Verhaltensspezifikation und der Vergleich verschiedener Architekturkonzepte untereinander. Wesentliche Vorteile des Compilers ergeben sich aus der Tatsache, daß bereits dem Hardware-Designer eine Compiler-Unterstützung zur Verfügung steht. Schnittstellen-Probleme zwischen Compiler- und Hardware-Entwicklung entfallen, da für beide die gleiche Maschinenbeschreibung benutzt wird. So können beim Architekturentwurf verschiedene Konzepte hinsichtlich Laufzeit und Codedichte längerer Anwendungsprogramme verglichen werden.
|
| Peter Marwedel. A New Synthesis Algorithm for the MIMOLA Software System. In Proceedings of the 23th Design Automation Conference (DAC) June 1986 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1986:dac,
author = {Marwedel, Peter},
title = {A New Synthesis Algorithm for the MIMOLA Software System},
booktitle = {Proceedings of the 23th Design Automation Conference (DAC)},
year = {1986},
month = {jun},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1986-dac.pdf},
confidential = {n},
abstract = {The MIMOLA software system is a system for the design of digital processors. The system includes subsystems for retargetable microcode generation, automatic generation of self-test programs and a synthesis subsystem. This paper describes the synthesis part of the system, which accepts a PASCAL-like, high-level program as specification and produces a register transfer structure. Because of the complexity of this design process, a set of sub-problems is identified and algorithms for their solution are indicated. These algorithms include a flexible statement decomposition, statement scheduling, register assignment, module selection and optimizations of interconnections and instruction word length.},
} The MIMOLA software system is a system for the design of digital processors. The system includes subsystems for retargetable microcode generation, automatic generation of self-test programs and a synthesis subsystem. This paper describes the synthesis part of the system, which accepts a PASCAL-like, high-level program as specification and produces a register transfer structure. Because of the complexity of this design process, a set of sub-problems is identified and algorithms for their solution are indicated. These algorithms include a flexible statement decomposition, statement scheduling, register assignment, module selection and optimizations of interconnections and instruction word length.
|
| Peter Marwedel. The MIMOLA Design System: Tools for the Design of Digital Processors . In Proceedings of the 21th Design Automation Conference (DAC) 1984 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1984:dac,
author = {Marwedel, Peter},
title = {The MIMOLA Design System: Tools for the Design of Digital Processors },
booktitle = {Proceedings of the 21th Design Automation Conference (DAC)},
year = {1984},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1984-dac.pdf},
confidential = {n},
abstract = {The MIMOLA design method is a method for the design of digital processors from a very highlevel bevavioral specification. A key feature of this method is the synthesis of a processor from a description of programs which are expected to be typical for the applications of that processor. Design cycles, in which the designer tries to improve automatically generated hardware structures, are supported by a retargetable microcode generator and by an utilization and performance analyzer. This paper describes the design method, available software tools and some applications.},
} The MIMOLA design method is a method for the design of digital processors from a very highlevel bevavioral specification. A key feature of this method is the synthesis of a processor from a description of programs which are expected to be typical for the applications of that processor. Design cycles, in which the designer tries to improve automatically generated hardware structures, are supported by a retargetable microcode generator and by an utilization and performance analyzer. This paper describes the design method, available software tools and some applications.
|
| Peter Marwedel. The Integrated Design of Computer Systems with MIMOLA. In Proc. 12th annual Gi-conf., Informatik Fachberichte #57 June 1982 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1982:gi,
author = {Marwedel, Peter},
title = {The Integrated Design of Computer Systems with MIMOLA},
booktitle = {Proc. 12th annual Gi-conf., Informatik Fachberichte #57},
year = {1982},
month = {jun},
publisher = {Springer},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1982-gi-Vol57.pdf},
confidential = {n},
abstract = {A design method and the corresponding CAD tools for the design of digital computers from a very high-level functional specification is presented. Computers are specified by a set of application programs, for which they shall be optimized, and a set of boundary conditions for the type and number of hardware resources. The paper describes CAD tools, which synthesize architectures from the specification, evaluate their cost and performance and generate microcode. Several design iterations, in which the designer plays an active role, are used to optimize the design. Two applications of the design method are included in the paper.},
} A design method and the corresponding CAD tools for the design of digital computers from a very high-level functional specification is presented. Computers are specified by a set of application programs, for which they shall be optimized, and a set of boundary conditions for the type and number of hardware resources. The paper describes CAD tools, which synthesize architectures from the specification, evaluate their cost and performance and generate microcode. Several design iterations, in which the designer plays an active role, are used to optimize the design. Two applications of the design method are included in the paper.
|
| Peter Marwedel. The MIMOLA Design System: Detailed Description of the Software System.. In Proceedings of the 16th Design Automation Conference (DAC) 1979 [BibTeX][PDF][Abstract]@inproceedings { marwedel:1979:dac,
author = {Marwedel, Peter},
title = {The MIMOLA Design System: Detailed Description of the Software System.},
booktitle = {Proceedings of the 16th Design Automation Conference (DAC)},
year = {1979},
file = {http://ls12-www.cs.tu-dortmund.de/daes/media/documents/publications/downloads/1979-dac.pdf},
confidential = {n},
abstract = {A software system is described which aids in the design of digital processors with a topdown method. It takes the design language MIMOLA as its input. Input may be either a high level functional or a structural description of the hardware. The output is a description of the hardware on the block level. In the case of a high level functional input, the output contains a listing of the hardware which is necessary to execute the input together with utilization factors of the hardware units. The amount of creatable hardware may be limited in a declaration. If there is not enough hardware to execute the input statements in parallel, the necessary intermediate steps are inserted by the system and a corresponding functional description is presented.},
} A software system is described which aids in the design of digital processors with a topdown method. It takes the design language MIMOLA as its input. Input may be either a high level functional or a structural description of the hardware. The output is a description of the hardware on the block level. In the case of a high level functional input, the output contains a listing of the hardware which is necessary to execute the input together with utilization factors of the hardware units. The amount of creatable hardware may be limited in a declaration. If there is not enough hardware to execute the input statements in parallel, the necessary intermediate steps are inserted by the system and a corresponding functional description is presented.
|