author	 = {Nabeeh Jumah and Julian Kunkel},
	title	 = {{Optimizing Memory Bandwidth Efficiency with User-Preferred Kernel Merge}},
	year	 = {2020},
	month	 = {05},
	booktitle	 = {{Euro-Par 2019: Parallel Processing Workshops}},
	editor	 = {Ulrich Schwardmann and Christian Boehme and Dora B. Heras and Valeria Cardellini and Emmanuel Jeannot and Antonio Salis and Claudio Schifanella and Ravi Reddy Manumachu and Dieter Schwamborn and Laura Ricci and Oh Sangyoon and Thomas Gruber and Laura Antonelli and Stephen L. Scott},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11997},
	pages	 = {69--81},
	conference	 = {COLOC - Workshop on Data Locality},
	location	 = {Göttingen, Germany},
	isbn	 = {978-3-030-48340-1},
	issn	 = {1611-3349},
	doi	 = {},
	abstract	 = {Earth system modeling computations use stencils extensively while running many kernels. Optimal coding of the stencils is essential to efficiently use memory bandwidth of an underlying hardware. This is important as stencil computations are memory bound.  Even when the code within one kernel is written to optimally use the memory bandwidth, there could be still opportunities to further do some optimization at the inter-kernel level. Stencils naturally exhibit data locality, and executing a sequence of stencils within separate kernels could waste caching capabilities. Merging the kernels allows to improve the use of the caches.  Some tools were developed to automatically fuse loops instead of the manual optimization. However, scientists still apply fusion in different levels of loop nests manually to find optimal performance. To allow scientists to still apply loop fusions equal to manual loop fusion, we develop a technique to automatically analyse the code and allow scientists to apply their preferred fusions without doing the effort of dependency analysis and code transformation. Our work is done using GGDML language extensions which enable performance portability over different architectures using a single source code.},