author	 = {Nabeeh Jumah and Julian Kunkel},
	title	 = {{Scalable Parallelization of Stencils using MODA}},
	year	 = {2019},
	month	 = {07},
	booktitle	 = {{High Performance Computing: ISC High Performance 2019 International Workshops, Frankfurt/Main, Germany, June 20, 2019, Revised Selected Papers}},
	editor	 = {},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	conference	 = {P^3MA workshop, ISC HPC},
	location	 = {Frankfurt, Germany},
	isbn	 = {to appear},
	issn	 = {1611-3349},
	abstract	 = {The natural and the design limitations of the evolution of processors, e.g., frequency scaling and memory bandwidth bottlenecks, push towards scaling applications on multiple-node configurations besides to exploiting the power of each single node. This introduced new challenges to porting applications to the new infrastructure, especially with the heterogeneous environments. Domain decomposition and handling the resulting necessary communication is not a trivial task. Parallelizing code automatically cannot be decided by tools in general as a result of the semantics of the general-purpose languages. To allow scientists to avoid such problems, we introduce the Memory-Oblivious Data Access (MODA) technique, and use it to scale code to configurations ranging from a single node to multiple nodes, supporting different architectures, without requiring changes in the source code of the application. We present a technique to automatically identify necessary communication based on higher-level semantics. The extracted information enables tools to generate code that handles the communication. A prototype is developed to implement the techniques and used to evaluate the approach. The results show the effectiveness of using the techniques to scale code on multi-core processors and on GPU based machines. Comparing the ratios of the achieved GFLOPS to the number of nodes in each run, and repeating that on different numbers of nodes shows that the achieved scaling efficiency is around 100\%. This was repeated with up to 100 nodes. An exception to this is the single-node configuration using a GPU, in which no communication is needed, and hence, no data movement between GPU and host memory is needed, which yields higher GFLOPS.},