author	 = {Kunkel, Julian},
	title	 = {{Using Simulation to Validate Performance of MPI(-IO) Implementations}},
	year	 = {2013},
	month	 = jun,
	booktitle	 = {{Supercomputing}},
	editor	 = {Kunkel, Julian Martin and Ludwig, Thomas and Meuer, Hans Werner},
	publisher	 = {Springer},
	address	 = {Berlin, Heidelberg},
	series	 = {Lecture Notes in Computer Science},
	number	 = {7905},
	pages	 = {181--195},
	conference	 = {ISC 2013},
	location	 = {Leipzig, Germany},
	isbn	 = {978-3-642-38749-4},
	issn	 = {0302-9743},
	doi	 = {10.1007/978-3-642-38750-0_14},
	abstract	 = {Parallel file systems and MPI implementations aim to exploit available hardware resources in order to achieve optimal performance. Since performance is influenced by many hardware and software factors, achieving optimal performance is a daunting task. For these reasons, optimized communication and I/O algorithms are still subject to research. While complexity of collective MPI operations is discussed in literature sometimes, theoretic assessment of the measurements is de facto non-existent. Instead, conducted analysis is typically limited to performance comparisons to previous algorithms. However, observable performance is not only determined by the quality of an algorithm. At run-time performance could be degraded due to unexpected implementation issues and triggered hardware and software exceptions. By applying a model that resembles the system, simulation allows us to estimate the performance. With this approach, the non-function requirement for performance of an implementation can be validated and run-time inefficiencies can be localized. In this paper we demonstrate how simulation can be applied to assess observed performance of collective MPI calls and parallel IO. PIOsimHD, an event-driven simulator, is applied to validate observed performance on our 10 node cluster. The simulator replays recorded application activity and point-to-point operations of collective operations. It also offers the option to record trace files for visual comparison to recorded behavior. With the innovative introspection into behavior, several bottlenecks in system and implementation are localized.},