BibTeX

@article{ICIKL16,
	author	 = {Julian Kunkel and Jakob Lüttgau},
	title	 = {{Interaktiver C-Programmierkurs, ICP}},
	year	 = {2016},
	month	 = {11},
	journal	 = {Synergie, Fachmagazin für Digitalisierung in der Lehre},
	number	 = {2},
	pages	 = {74--75},
	abstract	 = {Programming languages form the basis for automated data processing in the digital world. Although the basic concepts are easy to understand, only a small share of people masters these tools. The reasons for this are deficits in education and the high entry barrier of setting up a productive programming environment. In particular, learning a programming language requires practical application of the language, comparable to learning a foreign language. The goal of the project is the creation of an interactive course for teaching the C programming language. The interactivity and the automatic feedback offered are oriented toward the needs of the participants and make it possible to build up and extend knowledge autodidactically. The lessons include both introductions to specific subtopics and more demanding exercises that foster academic problem-solving skills. This serves different academic target groups and introduces people from various parts of civil society to computer science. The programming course developed in this project and the programming platform can be used freely worldwide; the source code and the lessons are available under open-source licenses and can therefore be adapted to individual needs. In particular, this enables participation and the contribution of new lessons to the platform.},
	url	 = {https://uhh.de/cp3i1},
}

@inbook{HTFBDPNK20,
	author	 = {Philipp Neumann and Julian Kunkel},
	title	 = {{High-Performance Techniques for Big Data Processing}},
	year	 = {2020},
	month	 = {04},
	booktitle	 = {{Knowledge Discovery in Big Data from Astronomy and Earth Observation}},
	publisher	 = {Elsevier},
	address	 = {3251 Riverport Lane, St. Louis, Missouri 63043},
	pages	 = {137--158},
	isbn	 = {978-0-12-819154-5},
}

@inbook{ICIKL17,
	author	 = {Julian Kunkel and Jakob Lüttgau},
	title	 = {{Interaktiver C-Programmierkurs, ICP}},
	year	 = {2017},
	month	 = {04},
	booktitle	 = {{HOOU Content Projekte der Vorprojektphase 2015/16 -- Sonderband zum Fachmagazin Synergie}},
	publisher	 = {Universität Hamburg},
	address	 = {Universität Hamburg, Mittelweg 177, 20148 Hamburg},
	pages	 = {182--186},
	isbn	 = {978-3-924330-57-6},
	abstract	 = {Programming languages form the basis for automated data processing in the digital world. Although the basic concepts are easy to understand, only a small share of people masters these tools. The reasons for this are deficits in education and the high entry barrier of setting up a productive programming environment. In particular, learning a programming language requires practical application of the language, comparable to learning a foreign language. The goal of the project is the creation of an interactive course for teaching the C programming language. The interactivity and the automatic feedback offered are oriented toward the needs of the participants and make it possible to build up and extend knowledge autodidactically. The lessons include both introductions to specific subtopics and more demanding exercises that foster academic problem-solving skills. This serves different academic target groups and introduces people from various parts of civil society to computer science. The programming course developed in this project and the programming platform can be used freely worldwide; the source code and the lessons are available under open-source licenses and can therefore be adapted to individual needs. In particular, this enables participation and the contribution of new lessons to the platform.},
	url	 = {https://www.synergie.uni-hamburg.de/media/sonderbaende/hoou-content-projekte-2015-2016.pdf},
}

@inbook{TETMPCACPM12,
	author	 = {Timo Minartz and Daniel Molka and Julian Kunkel and Michael Knobloch and Michael Kuhn and Thomas Ludwig},
	title	 = {{Tool Environments to Measure Power Consumption and Computational Performance}},
	year	 = {2012},
	booktitle	 = {{Handbook of Energy-Aware and Green Computing}},
	publisher	 = {Chapman and Hall/CRC Press, Taylor and Francis Group},
	address	 = {6000 Broken Sound Parkway NW, Boca Raton, FL 33487},
	chapter	 = {31},
	pages	 = {709--743},
	isbn	 = {978-1-4398-5040-4},
}

@inbook{AACAIMFESK20,
	author	 = {Julian Kunkel and Nabeeh Jumah and Anastasiia Novikova and Thomas Ludwig and Hisashi Yashiro and Naoya Maruyama and Mohamed Wahib and John Thuburn},
	title	 = {{AIMES: Advanced Computation and I/O Methods for Earth-System Simulations}},
	year	 = {2020},
	month	 = {07},
	editor	 = {Hans-Joachim Bungartz and Severin Reiz and Benjamin Uekermann and Philipp Neumann and Wolfgang E. Nagel},
	publisher	 = {Springer International Publishing},
	booktitle	 = {{Software for Exascale Computing -- SPPEXA 2016--2019}},
	series	 = {Lecture Notes in Computational Science and Engineering, Volume 136},
	pages	 = {61--102},
	isbn	 = {978-3-030-47956-5},
	issn	 = {2197-7100},
	doi	 = {https://doi.org/10.1007/978-3-030-47956-5_5},
	abstract	 = {Dealing with extreme-scale Earth-system models is challenging from the computer science perspective, as the required computing power and storage capacity are steadily increasing. Scientists perform runs with growing resolution or aggregate results from many similar smaller-scale runs with slightly different initial conditions (the so-called ensemble runs). In the fifth Coupled Model Intercomparison Project (CMIP5), the produced datasets require more than three Petabytes of storage, and the compute and storage requirements are increasing significantly for CMIP6. Climate scientists across the globe are developing next-generation models based on improved numerical formulations, leading to grids that are discretized in alternative forms such as an icosahedral (geodesic) grid. The developers of these models face similar problems in scaling, maintaining and optimizing code. Performance portability and the maintainability of code are key concerns of scientists as, compared to industry projects, model code is continuously revised and extended to incorporate further levels of detail. This leads to a rapidly growing code base that is rarely refactored. However, code modernization is important to maintain the productivity of the scientists working with the code and to utilize the performance provided by modern and future architectures. The need for performance optimization is motivated by the evolution of the parallel architecture landscape from homogeneous flat machines to heterogeneous combinations of processors with deep memory hierarchies. Notably, the rise of many-core, throughput-oriented accelerators, such as GPUs, requires non-trivial code changes at minimum and, even worse, may necessitate a substantial rewrite of the existing codebase. At the same time, the code complexity increases the difficulty for computer scientists and vendors to understand and optimize the code for a given system. Storing the products of climate predictions requires a large storage and archival system, which is expensive. Often, scientists restrict the number of scientific variables and the write interval to keep the costs balanced. Compression algorithms can reduce the costs significantly but can also increase the scientific yield of simulation runs. In the AIMES project, we addressed the key issues of programmability, computational efficiency and I/O limitations that are common in next-generation icosahedral Earth-system models. The project focused on the separation of concerns between domain scientists, computational scientists, and computer scientists.},
}

@article{POIAWICAWK20,
	author	 = {Julian Kunkel and Luciana Pedro},
	title	 = {{Potential of I/O Aware Workflows in Climate and Weather}},
	year	 = {2020},
	month	 = {04},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	publisher	 = {Publishing Center of South Ural State University},
	address	 = {454080, Lenin prospekt, 76, Chelyabinsk, Russia},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 7, Number 2},
	pages	 = {35--53},
	issn	 = {2313-8734},
	doi	 = {https://doi.org/10.14529/jsfi200203},
	abstract	 = {The efficient, convenient, and robust execution of data-driven workflows and enhanced data management are essential for productivity in scientific computing. In HPC, the concerns of storage and computing are traditionally separated and optimised independently from each other and the needs of the end-to-end user. However, in complex workflows, this is becoming problematic. These problems are particularly acute in climate and weather workflows, which as well as becoming increasingly complex and exploiting deep storage hierarchies, can involve multiple data centres. The key contributions of this paper are: 1) a sketch of a vision for an integrated data-driven approach, with a discussion of the associated challenges and implications, and 2) an architecture and roadmap consistent with this vision that would allow a seamless integration into current climate and weather workflows as it utilises versions of existing tools (ESDM, Cylc, XIOS, and DDN’s IME). The vision proposed here is built on the belief that workflows composed of data, computing, and communication-intensive tasks should drive interfaces and hardware configurations to better support the programming models. When delivered, this work will increase the opportunity for smarter scheduling of computing by considering storage in heterogeneous storage systems. We illustrate the performance impact on an example workload using a model built on measured performance data using ESDM at DKRZ.},
}

@article{PAIPOOAGND20,
	author	 = {Tiejun Wang and Zhuang Liu and Julian Kunkel and Changming Zhao},
	title	 = {{Parallelization and I/O Performance Optimization of a Global Nonhydrostatic Dynamical Core using MPI}},
	year	 = {2020},
	month	 = {04},
	publisher	 = {Tech Science Press},
	journal	 = {Computers, Materials and Continua},
	series	 = {Volume 63, Issue 3},
	pages	 = {1399--1413},
	issn	 = {1546-2226},
	doi	 = {https://doi.org/10.32604/cmc.2020.09701},
	abstract	 = {The Global-Regional Integrated forecast System (GRIST) is the next-generation integrated weather and climate model dynamic framework developed by the Chinese Academy of Meteorological Sciences. In this paper, we present several changes made to the global nonhydrostatic dynamical (GND) core, which is part of the ongoing prototype of GRIST. The changes, leveraging MPI and PnetCDF techniques, were targeted at the parallelization and performance optimization of the original serial GND core. Meanwhile, some sophisticated data structures and interfaces were designed to flexibly adjust the size of boundary and halo domains according to the variable accuracy in a parallel context. In addition, the I/O performance of PnetCDF decreases as the number of MPI processes increases in our experimental environment. Especially when the number exceeds 6000, it caused system-wide outages (SWO). Thus, a grouping solution was proposed to overcome that issue. Several experiments were carried out on the supercomputing platform based on Intel x86 CPUs in the National Supercomputing Center in Wuxi. The results demonstrated that the parallel GND core based on the grouping solution achieves good strong scalability and improves the performance significantly, as well as avoiding the SWOs.},
}

@article{THCFTAGAHC20,
	author	 = {Julian Kunkel and Weronika Filinger and Christian Meesters and Anja Gerbes},
	title	 = {{The HPC Certification Forum: Toward a Globally Acknowledged HPC Certification}},
	year	 = {2020},
	month	 = {07},
	publisher	 = {IEEE},
	journal	 = {Computing in Science and Engineering},
	series	 = {Volume 22, Issue 4},
	pages	 = {110--114},
	issn	 = {1558-366X},
	doi	 = {https://doi.org/10.1109/MCSE.2020.2996073},
	abstract	 = {The goal of the HPC Certification Forum is to categorize, define, and examine competencies expected from proficient HPC practitioners. The community-led forum is working toward establishing a globally acknowledged HPC certification process, a process that engages with HPC centres to identify gaps in users’ knowledge, and with users to identify the skills required to perform their tasks. In this article, we introduce the forum and summarize the progress made over the last two years. The release of the first officially supported certificate is planned for the second half of 2020.},
	url	 = {https://www.computer.org/csdl/magazine/cs},
}

@article{CHSTTHCFKH20,
	author	 = {Julian Kunkel and Kai Himstedt and Weronika Filinger and Jean-Thomas Acquaviva and Anja Gerbes and Lev Lafayette},
	title	 = {{Contributing HPC Skills to the HPC Certification Forum}},
	year	 = {2020},
	month	 = {01},
	editor	 = {Steven I. Gordon},
	journal	 = {Journal of Computational Science Education},
	series	 = {Volume 11, Issue 1},
	pages	 = {106--107},
	issn	 = {2153-4136},
	doi	 = {https://doi.org/10.22369/issn.2153-4136/11/1/17},
	abstract	 = {The International HPC Certification Program was officially launched over a year ago at ISC’18 and has since made significant progress in categorising and defining the skills required to proficiently use a variety of HPC systems. The program has reached the stage at which support and input from the HPC community are essential. For the certification to be recognised widely, it needs to capture the skills required by the majority of HPC users, regardless of their level. This cannot be achieved without contributions from the community. This extended abstract briefly presents the current state of the developed Skill Tree and explains how contributors can extend it. In the talk, we focus on the contribution aspects.},
	url	 = {http://jocse.org/issues/11/1/},
}

@article{OYHCFIRKAH20,
	author	 = {Julian Kunkel and Jean-Thomas Acquaviva and Kai Himstedt and Weronika Filinger and Anja Gerbes and Lev Lafayette},
	title	 = {{One Year HPC Certification Forum in Retrospective}},
	year	 = {2020},
	month	 = {01},
	editor	 = {Steven I. Gordon},
	journal	 = {Journal of Computational Science Education},
	series	 = {Volume 11, Issue 1},
	pages	 = {29--35},
	issn	 = {2153-4136},
	doi	 = {https://doi.org/10.22369/issn.2153-4136/11/1/6},
	abstract	 = {The ever-changing nature of HPC has always compelled the HPC community to invest substantial effort in the training of new and existing practitioners. Historically, these efforts were tailored around a typical group of users possessing, due to their background, a certain set of programming skills. However, as HPC has become more diverse in terms of hardware, software and user background, the traditional training approaches have become insufficient in addressing the training needs of our community. This increasingly complicated HPC landscape makes the development and delivery of new training materials challenging. How should we develop training for users who often come from non-traditional HPC disciplines and are only interested in learning a particular set of skills? How can we satisfy their training needs if we don't really understand what these are? It is clear that HPC centres struggle to identify and overcome the gaps in users' knowledge, while users struggle to identify the skills required to perform their tasks. With the HPC Certification Forum, we aim to clearly categorise, define, and examine competencies expected from proficient HPC practitioners. In this article, we report the status and progress this independent body has made during the first year of its existence. The drafted processes and prototypes are expected to mature into a holistic ecosystem beneficial for all stakeholders in HPC education.},
	url	 = {http://jocse.org/issues/11/1/},
}

@article{IOBASSFEAC19,
	author	 = {Julian Kunkel and Hayk Shoukourian and Reza Heidari and Torsten Wilde},
	title	 = {{Interference of Billing and Scheduling Strategies for Energy and Cost Savings in Modern Data Centers}},
	year	 = {2019},
	month	 = {04},
	editor	 = {Ishfaq Ahmad},
	publisher	 = {Elsevier},
	journal	 = {Sustainable Computing: Informatics and Systems},
	series	 = {Sustainable Computing},
	issn	 = {2210-5379},
	doi	 = {https://doi.org/10.1016/j.suscom.2019.04.003},
	abstract	 = {The high energy consumption of HPC systems is an obstacle for ever-growing systems. Unfortunately, energy consumption does not decrease linearly with reduced workload; therefore, energy conservation techniques have been deployed on various levels which steer the overall system. While the overall saving of energy is useful, the price of energy is not necessarily proportional to the consumption. Particularly with renewable energies, there are occasions in which the price is significantly lower. Research on the potential of saving energy costs by using smart contracts with energy providers is lacking. In this paper, we conduct an analysis of the potential savings when applying cost-aware schedulers to data center workloads while considering power contracts that allow for dynamic (hourly) pricing. The contributions of this paper are twofold: 1) the theoretic assessment of cost savings; 2) the development of a simulator to replay batch scheduler traces which supports flexible energy cost models and various cost-aware scheduling algorithms. This makes it possible to approximate the energy cost savings of data centers for various scenarios including off-peak and hourly budgeted energy prices as provided by the energy spot market. An evaluation is conducted with four annual job traces from the German Climate Computing Center (DKRZ) and the Leibniz Supercomputing Centre (LRZ). The theoretic analysis indicates cost savings of 4-8\% when shutting down unused client nodes, and 6-20\% with hourly cost models and optimal scheduling. The experimental validation of a practicable scheduler improves the accuracy compared to the theoretical best-case analysis. As expected, a cost-efficient scheduling algorithm that is fed with information about future energy costs shifts jobs to the timeslots where the job execution is cheaper and reduces the energy expenditure, yet increases the waiting times of pending jobs. However, the expected savings for this effort are not justifiable compared to the simple strategy of turning off the unused nodes. Additionally, we compare the cost savings to the total costs of ownership, showing that smaller systems with on-demand provisioning yield better cost efficiency.},
}

@article{TAHCPKHHSS19,
	author	 = {Julian Kunkel and Kai Himstedt and Nathanael Hübbe and Hinnerk Stüben and Sandra Schröder and Michael Kuhn and Matthias Riebisch and Stephan Olbrich and Thomas Ludwig and Weronika Filinger and Jean-Thomas Acquaviva and Anja Gerbes and Lev Lafayette},
	title	 = {{Towards an HPC Certification Program}},
	year	 = {2019},
	month	 = {01},
	editor	 = {Steven I. Gordon},
	journal	 = {Journal of Computational Science Education},
	series	 = {Volume 10, Issue 1},
	pages	 = {88--89},
	doi	 = {https://doi.org/10.22369/issn.2153-4136/10/1/14},
	abstract	 = {The HPC community has always considered the training of new and existing HPC practitioners to be of high importance to its growth. The diversification of HPC practitioners challenges the traditional training approaches, which are not able to satisfy the specific needs of users, often coming from non-traditional HPC disciplines and only interested in learning a particular set of competences. Challenges for HPC centres are to identify and overcome the gaps in users’ knowledge, while users struggle to identify relevant skills. We have developed a first version of an HPC certification program that would clearly categorize, define, and examine competences. Making clear what skills are required of or recommended for a competent HPC user would benefit both the HPC service providers and practitioners. Moreover, it would allow centres to bundle together skills that are most beneficial for specific user roles and scientific domains. From the perspective of content providers, existing training material can be mapped to competences, allowing users to quickly identify and learn the skills they require. Finally, certificates recognized by the whole HPC community simplify the inter-comparison of independently offered courses and provide an additional incentive for participation.},
	url	 = {http://www.jocse.org/articles/10/1/14/},
}

@article{ASSOITVSKT18,
	author	 = {Raul Torres and Julian Kunkel and Manuel F. Dolz and Thomas Ludwig},
	title	 = {{A Similarity Study of I/O Traces via String Kernels}},
	year	 = {2018},
	month	 = {07},
	publisher	 = {Springer},
	journal	 = {The Journal of Supercomputing},
	pages	 = {1--13},
	issn	 = {0920-8542},
	doi	 = {https://doi.org/10.1007/s11227-018-2471-x},
	abstract	 = {Understanding I/O for data-intense applications is the foundation for the optimization of these applications. The classification of applications according to the expressed I/O access pattern eases the analysis. An access pattern can be seen as a fingerprint of an application. In this paper, we address the classification of traces. Firstly, we convert them into a weighted string representation. Because string objects can be easily compared using kernel methods, we explore their use for fingerprinting I/O patterns. To improve accuracy, we propose a novel string kernel function called the kast2 spectrum kernel. The similarity matrices, obtained after applying the mentioned kernel over a set of examples from a real application, were analyzed using kernel principal component analysis and hierarchical clustering. The evaluation showed that two out of four I/O access pattern groups were completely identified, while the other two groups formed a single cluster due to the intrinsic similarity of their members. The proposed strategy can promisingly be applied to other similarity problems involving tree-like structured data.},
}

@article{ASOSSFHCLK18,
	author	 = {Jakob Lüttgau and Michael Kuhn and Kira Duwe and Yevhen Alforov and Eugen Betke and Julian Kunkel and Thomas Ludwig},
	title	 = {{A Survey of Storage Systems for High-Performance Computing}},
	year	 = {2018},
	month	 = {04},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	publisher	 = {Publishing Center of South Ural State University},
	address	 = {454080, Lenin prospekt, 76, Chelyabinsk, Russia},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 5, Number 1},
	pages	 = {31--58},
	doi	 = {https://doi.org/10.14529/jsfi180103},
	abstract	 = {In current supercomputers, storage is typically provided by parallel distributed file systems for hot data and tape archives for cold data. These file systems are often compatible with local file systems due to their use of the POSIX interface and semantics, which eases development and debugging because applications can easily run both on workstations and supercomputers. There is a wide variety of file systems to choose from, each tuned for different use cases and implementing different optimizations. However, the overall application performance is often held back by I/O bottlenecks due to insufficient performance of file systems or I/O libraries for highly parallel workloads. Performance problems are dealt with using novel storage hardware technologies as well as alternative I/O semantics and interfaces. These approaches have to be integrated into the storage stack seamlessly to make them convenient to use. Upcoming storage systems abandon the traditional POSIX interface and semantics in favor of alternative concepts such as object and key-value storage; moreover, they heavily rely on technologies such as NVM and burst buffers to improve performance. Additional tiers of storage hardware will increase the importance of hierarchical storage management. Many of these changes will be disruptive and require application developers to rethink their approaches to data management and I/O. A thorough understanding of today's storage infrastructures, including their strengths and weaknesses, is crucially important for designing and implementing scalable storage systems suitable for demands of exascale computing.},
	url	 = {http://superfri.org/superfri/article/view/162},
}

@article{UHASMWRTPC17,
	author	 = {Julian Kunkel and Manuel F. Dolz},
	title	 = {{Understanding Hardware and Software Metrics with respect to Power Consumption}},
	year	 = {2017},
	month	 = {11},
	editor	 = {Ishfaq Ahmad},
	publisher	 = {Elsevier},
	journal	 = {Sustainable Computing: Informatics and Systems},
	series	 = {Sustainable Computing},
	issn	 = {2210-5379},
	doi	 = {https://doi.org/10.1016/j.suscom.2017.10.016},
	abstract	 = {Analyzing and understanding the energy consumption of applications is an important task which allows researchers to develop novel strategies for optimizing and conserving energy. A typical methodology is to reduce the complexity of real systems and applications by developing a simplified performance model from observed behavior. Many such models are known in the literature; however, inherent to any simplification is that some measured data cannot be explained well. While analyzing a model's accuracy, it is highly important to identify the properties of such prediction errors. Such knowledge can then be used to improve the model or to optimize the benchmarks used for training the model parameters. For such a benchmark suite, it is important that the benchmarks cover all aspects of system behavior to avoid overfitting of the model for certain scenarios. It is not trivial to identify the overlap between the benchmarks and to answer the question whether a benchmark causes different hardware behavior. Inspection of all the available hardware and software counters by humans is a tedious task given the large amount of real-time data they produce. In this paper, we utilize statistical techniques to foster understanding and to investigate hardware counters as potential indicators of energy behavior. We capture hardware and software counters including power with a fixed frequency and analyze the resulting timelines of these measurements. The concepts introduced can be applied to any set of measurements in order to compare them to another set of measurements. We demonstrate how these techniques can aid in identifying interesting behavior and significantly reduce the number of features that must be inspected. Next, we propose counters that can potentially be used for building linear models for predicting power consumption with a relative accuracy of 3\%. Finally, we validate the completeness of a benchmark suite, from the point of view of using the available architectural components, for generating accurate models.},
	url	 = {http://www.sciencedirect.com/science/article/pii/S2210537916300865},
}

@article{TDTSOCAFQC17,
	author	 = {Julian Kunkel and Anastasiia Novikova and Eugen Betke},
	title	 = {{Towards Decoupling the Selection of Compression Algorithms from Quality Constraints – an Investigation of Lossy Compression Efficiency}},
	year	 = {2017},
	month	 = {12},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 4, Number 4},
	pages	 = {17--33},
	doi	 = {https://doi.org/10.14529/jsfi170402},
	abstract	 = {Data-intense scientific domains use data compression to reduce the storage space needed. Lossless data compression preserves information accurately, but lossy data compression can achieve much higher compression rates depending on the tolerable error margins. There are many ways of defining precision and of exploiting this knowledge; therefore, the field of lossy compression is subject to active research. From the perspective of a scientist, only the qualitative definition of the implied loss of data precision should matter. With the Scientific Compression Library (SCIL), we are developing a meta-compressor that allows users to define various quantities for acceptable error and expected performance behavior. The library then picks a suitable chain of algorithms satisfying the user's requirements; the ongoing work is a preliminary stage for the design of an adaptive selector. This approach is a crucial step towards a scientifically safe use of much-needed lossy data compression, because it disentangles the task of determining scientific characteristics of tolerable noise from the task of determining an optimal compression strategy. Future algorithms can be used without changing application code. In this paper, we evaluate various lossy compression algorithms for compressing different scientific datasets (Isabel, ECHAM6), and focus on the analysis of synthetically created data that serves as a blueprint for many observed datasets. We also briefly describe the available quantities of SCIL to define data precision and introduce two efficient compression algorithms for individual data points. This shows that the best algorithm depends on user settings and data properties.},
	url	 = {http://superfri.org/superfri/article/view/149},
}

@article{GIMLEJKZYD17,
	author	 = {Nabeeh Jumah and Julian Kunkel and Günther Zängl and Hisashi Yashiro and Thomas Dubos and Yann Meurdesoif},
	title	 = {{GGDML: Icosahedral Models Language Extensions}},
	year	 = {2017},
	month	 = {06},
	publisher	 = {Cosmos Scholars Publishing House},
	journal	 = {Journal of Computer Science Technology Updates},
	series	 = {Volume 4, Number 1},
	pages	 = {1--10},
	doi	 = {https://doi.org/10.15379/2410-2938.2017.04.01.01},
	abstract	 = {The optimization opportunities of a code base are not completely exploited by compilers. In fact, there are optimizations that must be done within the source code. Hence, if the code developers skip some details, some performance is lost. Thus, the use of a general-purpose language to develop performance-demanding software, e.g. climate models, needs more care from the developers. They should take into account hardware details of the target machine. Besides, code written for high performance on one machine will show lower performance on another one. The developers usually write multiple optimized sections or even code versions for the different target machines. Such codes are complex and hard to maintain. In this article we introduce a higher-level code development approach, where we develop a set of extensions to the language that is used to write a model's code. Our extensions form a domain-specific language (DSL) that abstracts domain concepts and leaves the lower-level details to a configurable source-to-source translation process. The purpose of the developed extensions is to support the development of icosahedral climate/atmospheric models. We have started with three icosahedral models: DYNAMICO, ICON, and NICAM. The collaboration with scientists from the weather/climate sciences enabled agreed-upon extensions. When suggesting an extension, we kept in mind that it represents a higher-level domain-based concept and that it carries no lower-level details. The introduced DSL (GGDML -- General Grid Definition and Manipulation Language) hides optimization details like memory layout. It reduces the code size of a model to less than one third of its original size in terms of lines of code. The development costs of a model with GGDML are therefore reduced significantly.},
	url	 = {http://www.cosmosscholars.com/images/JCSTU_V4N1/JCSTU-V4N1A1-Jumah.pdf},
}

@article{ADPUSSIOSF16,
	author	 = {Julian Kunkel},
	title	 = {{Analyzing Data Properties using Statistical Sampling -- Illustrated on Scientific File Formats}},
	year	 = {2016},
	month	 = {10},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 3, Number 3},
	pages	 = {19--33},
	doi	 = {https://doi.org/10.14529/jsfi160304},
	abstract	 = {Understanding the characteristics of data stored in data centers helps computer scientists in identifying the most suitable storage infrastructure to deal with these workloads. For example, knowing the relevance of file formats allows optimizing the relevant formats, but also helps in a procurement to define benchmarks that cover these formats. Existing studies that investigate performance improvements and techniques for data reduction such as deduplication and compression operate on a subset of data. Some of those studies claim the selected data is representative and scale their results to the scale of the data center. One hurdle of running novel schemes on the complete data is the vast amount of data stored and, thus, the resources required to analyze the complete data set. Even if this were feasible, the costs for running many of those experiments must be justified. This paper investigates stochastic sampling methods to compute and analyze quantities of interest on file counts but also on the occupied storage space. It will be demonstrated that on our production system, scanning 1\% of files and data volume is sufficient to draw conclusions. This speeds up the analysis process and reduces the costs of such studies significantly.},
	url	 = {http://superfri.org/superfri/article/view/106},
}

@article{PIPIHUANNS16,
	author	 = {Jan Fabian Schmid and Julian Kunkel},
	title	 = {{Predicting I/O Performance in HPC Using Artificial Neural Networks}},
	year	 = {2016},
	month	 = {10},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 3, Number 3},
	pages	 = {34--39},
	doi	 = {https://doi.org/10.14529/jsfi160303},
	abstract	 = {The prediction of file access times is an important part of modeling supercomputers' storage systems. These models can be used to develop analysis tools which support users in implementing efficient I/O behavior. In this paper, we analyze and predict the access times of a Lustre file system from the client perspective. To this end, we measured file access times in various test series and developed different models for predicting access times. The evaluation shows that in models utilizing artificial neural networks, the average prediction error is about 30\% smaller than in linear models. A phenomenon in the distribution of file access times is of particular interest: file accesses with identical parameters show several typical access times. The typical access times usually differ by orders of magnitude and can be explained by a different processing of the file accesses in the storage system - an alternative I/O path. We investigate a method to automatically determine the alternative I/O path and quantify the significance of knowledge about the internal processing. It is shown that the prediction error is improved significantly with this approach.},
	url	 = {http://superfri.org/superfri/article/view/105},
}

@article{DCFCDKKL16,
	author	 = {Michael Kuhn and Julian Kunkel and Thomas Ludwig},
	title	 = {{Data Compression for Climate Data}},
	year	 = {2016},
	month	 = {06},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 3, Number 1},
	pages	 = {75--94},
	doi	 = {https://doi.org/10.14529/jsfi160105},
	url	 = {http://superfri.org/superfri/article/view/101},
}

@article{AAMTDPMBOH15,
	author	 = {Manuel F. Dolz and Julian Kunkel and Konstantinos Chasapis and Sandra Catalan},
	title	 = {{An analytical methodology to derive power models based on hardware and software metrics}},
	year	 = {2015},
	publisher	 = {Springer US},
	journal	 = {Computer Science - Research and Development},
	pages	 = {1--10},
	issn	 = {1865-2042},
	doi	 = {https://doi.org/10.1007/s00450-015-0298-8},
	abstract	 = {The use of models to predict the power consumption of a system is an appealing alternative to wattmeters since they avoid hardware costs and are easy to deploy. In this paper, we present an analytical methodology to build models with a reduced number of features in order to estimate power consumption at node level. We aim at building simple power models by performing a per-component analysis (CPU, memory, network, I/O) through the execution of four standard benchmarks. While they are executed, information from all the available hardware counters and resource utilization metrics provided by the system is collected. Based on correlations among the recorded metrics and their correlation with the instantaneous power, our methodology allows us (i) to identify the significant metrics and (ii) to assign weights to the selected metrics in order to derive reduced models. The reduction also aims at extracting models that are based on a set of hardware counters and utilization metrics that can be obtained simultaneously and, thus, can be gathered and computed on-line. The utility of our procedure is validated using real-life applications on an Intel Sandy Bridge architecture.},
}

@article{MECWSKAHWZ15,
	author	 = {Julian Kunkel and Alvaro Aguilera and Nathanael Hübbe and Marc Wiedemann and Michaela Zimmer},
	title	 = {{Monitoring energy consumption with SIOX}},
	year	 = {2015},
	month	 = {05},
	publisher	 = {Springer},
	journal	 = {Computer Science -- Research and Development},
	series	 = {Volume 30, Number 2},
	pages	 = {125--133},
	issn	 = {1865-2034},
	doi	 = {https://doi.org/10.1007/s00450-014-0271-y},
	abstract	 = {In the face of the growing complexity of HPC systems, their growing energy costs, and the increasing difficulty to run applications efficiently, a number of monitoring tools have been developed during the last years. SIOX is one such endeavor, with a uniquely holistic approach: it aims not only to record a certain kind of data, but to make all relevant data available for analysis and optimization. Among other sources, this encompasses data from hardware energy counters and trace data from different hardware/software layers. However, not all data that can be recorded should be recorded. As such, SIOX needs good heuristics to determine when and what data needs to be collected, and the energy consumption can provide an important signal about when the system is in a state that deserves closer attention. In this paper, we show that SIOX can use Likwid to collect and report the energy consumption of applications, and present how this data can be visualized using SIOX’s web-interface. Furthermore, we outline how SIOX can use this information to intelligently adjust the amount of data it collects, allowing it to reduce the monitoring overhead while still providing complete information about critical situations.},
	url	 = {http://link.springer.com/article/10.1007%2Fs00450-014-0271-y},
}

@article{ESSAASOEKK14,
	author	 = {Julian Kunkel and Michael Kuhn and Thomas Ludwig},
	title	 = {{Exascale Storage Systems -- An Analytical Study of Expenses}},
	year	 = {2014},
	month	 = {06},
	editor	 = {Jack Dongarra and Vladimir Voevodin},
	journal	 = {Supercomputing Frontiers and Innovations},
	series	 = {Volume 1, Number 1},
	pages	 = {116--134},
	url	 = {http://superfri.org/superfri/article/view/20},
}

@article{RTHFWMMAFI13,
	author	 = {Nathanael Hübbe and Julian Kunkel},
	title	 = {{Reducing the HPC-Datastorage Footprint with MAFISC -- Multidimensional Adaptive Filtering Improved Scientific data Compression}},
	year	 = {2013},
	month	 = {05},
	publisher	 = {Springer},
	journal	 = {Computer Science - Research and Development},
	series	 = {Volume 28, Issue 2-3},
	pages	 = {231--239},
	abstract	 = {Large HPC installations today also include large data storage installations. Data compression can significantly reduce the amount of data, and it was one of our goals to find out how much compression can do for climate data. The price of compression is, of course, the need for additional computational resources, so our second goal was to relate the savings of compression to the costs it necessitates. In this paper we present the results of our analysis of typical climate data. A lossless algorithm based on these insights is developed and its compression ratio is compared to that of standard compression tools. As it turns out, this algorithm is general enough to be useful for a large class of scientific data, which is the reason we speak of MAFISC as a method for scientific data compression. A numeric problem for lossless compression of scientific data is identified and a possible solution is given. Finally, we discuss the economics of data compression in HPC environments using the example of the German Climate Computing Center.},
	url	 = {http://link.springer.com/article/10.1007/s00450-012-0222-4},
}

@article{PCIPOMCZKB13,
	author	 = {Peter Ziegenhein and Cornelis Ph Kamerling and Mark Bangert and Julian Kunkel and Uwe Oelfke},
	title	 = {{Performance-optimized clinical IMRT planning on modern CPUs}},
	year	 = {2013},
	month	 = {05},
	publisher	 = {IOP Publishing},
	journal	 = {Physics in Medicine and Biology},
	series	 = {Volume 58, Number 11},
	issn	 = {1361-6560},
	doi	 = {https://doi.org/10.1088/0031-9155/58/11/3705},
	abstract	 = {Intensity modulated treatment plan optimization is a computationally expensive task. The feasibility of advanced applications in intensity modulated radiation therapy, such as everyday treatment planning, frequent re-planning for adaptive radiation therapy and large-scale planning research, severely depends on the runtime of the plan optimization implementation. Modern computational systems are built as parallel architectures to yield high performance. The use of GPUs, as one class of parallel systems, has become very popular in the field of medical physics. In contrast, we utilize the multi-core central processing unit (CPU), which is the heart of every modern computer and does not have to be purchased additionally. In this work we present an ultra-fast, high precision implementation of the inverse plan optimization problem using a quasi-Newton method on pre-calculated dose influence data sets. We redefined the classical optimization algorithm to achieve a minimal runtime and high scalability on CPUs. Using the methods proposed in this work, a total plan optimization process can be carried out in only a few seconds on a low-cost CPU-based desktop computer at clinical resolution and quality. We have shown that our implementation uses the CPU hardware resources efficiently, with runtimes comparable to GPU implementations, at lower costs.},
	url	 = {http://iopscience.iop.org/0031-9155/58/11/3705},
}

@article{SPPOAASLK12,
	author	 = {Julian Kunkel},
	title	 = {{Simulating parallel programs on application and system level}},
	year	 = {2012},
	month	 = {06},
	publisher	 = {Springer},
	address	 = {Berlin, Heidelberg},
	journal	 = {Computer Science -- Research and Development},
	series	 = {Volume 28, Number 2-3},
	issn	 = {1865-2042},
	doi	 = {https://doi.org/10.1007/s00450-012-0208-2},
	abstract	 = {Understanding the measured performance of parallel applications in real systems is difficult: with the aim to utilize the resources available, optimizations deployed in hardware and software layers build up to complex systems. However, in order to identify bottlenecks the performance must be assessed. This paper introduces PIOsimHD, an event-driven simulator for MPI-IO applications and the underlying (heterogeneous) cluster computers. With the help of the simulator, runs of MPI-IO applications can be conducted in silico; this includes detailed simulation of collective communication patterns as well as simulation of parallel I/O. The simulation estimates upper bounds for expected performance and helps assessing observed performance. Together with HDTrace, an environment which allows tracing the behavior of MPI programs and internals of MPI and PVFS, PIOsimHD enables us to localize inefficiencies, to conduct research on optimizations for communication algorithms, and to evaluate arbitrary and future systems. In this paper the simulator is introduced and an excerpt of the conducted validation is presented, which demonstrates the accuracy of the models for our cluster.},
	url	 = {http://link.springer.com/article/10.1007/s00450-012-0208-2},
}

@article{TIAOHSAAGA13,
	author	 = {Marc Wiedemann and Julian Kunkel and Michaela Zimmer and Thomas Ludwig and Michael Resch and Thomas Bönisch and Xuan Wang and Andriy Chut and Alvaro Aguilera and Wolfgang E. Nagel and Michael Kluge and Holger Mickler},
	title	 = {{Towards I/O Analysis of HPC Systems and a Generic Architecture to Collect Access Patterns}},
	year	 = {2013},
	month	 = {05},
	publisher	 = {Springer New York Inc.},
	address	 = {Hamburg, Berlin, Heidelberg},
	journal	 = {Computer Science - Research and Development},
	series	 = {28},
	pages	 = {241--251},
	issn	 = {1865-2034},
	abstract	 = {In high-performance computing applications, a high-level I/O call will trigger activities on a multitude of hardware components. These are massively parallel systems supported by huge storage systems and internal software layers. Their complex interplay currently makes it impossible to identify the causes for and the locations of I/O bottlenecks. Existing tools indicate when a bottleneck occurs but provide little guidance in identifying the cause or improving the situation. We have thus initiated the Scalable I/O for Extreme Performance (SIOX) project to find solutions for this problem. To achieve this goal, we will build in SIOX a system to record access information on all layers and components, to recognize access patterns, and to characterize the I/O system. The system will ultimately be able to recognize the causes of the I/O bottlenecks and propose optimizations for the I/O middleware that can improve I/O performance, such as throughput rate and latency. Furthermore, the SIOX system will be able to support decision making while planning new I/O systems. In this paper, we introduce the SIOX system and describe its current status: We first outline our approach for collecting the required access information. We then provide the architectural concept, the methods for reconstructing the I/O path and an excerpt of the interface for data collection. This paper focuses especially on the architecture, which collects and combines the relevant access information along the I/O path, and which is responsible for the efficient transfer of this information. An abstract modelling approach allows us to better understand the complexity of the analysis of the I/O activities on parallel computing systems, and an abstract interface allows us to adapt the SIOX system to various HPC file systems.},
	url	 = {http://link.springer.com/article/10.1007/s00450-012-0221-5},
}

@article{TAESIISTAI11,
	author	 = {Julian Kunkel and Timo Minartz and Michael Kuhn and Thomas Ludwig},
	title	 = {{Towards an Energy-Aware Scientific I/O Interface -- Stretching the ADIOS Interface to Foster Performance Analysis and Energy Awareness}},
	year	 = {2011},
	editor	 = {Thomas Ludwig},
	publisher	 = {Springer},
	address	 = {Berlin / Heidelberg, Germany},
	journal	 = {Computer Science - Research and Development},
	series	 = {1},
	doi	 = {https://doi.org/10.1007/s00450-011-0193-x},
	abstract	 = {Intelligently switching energy saving modes of CPUs, NICs and disks is mandatory to reduce the energy consumption. Hardware and operating system have a limited perspective of future performance demands; thus, automatic control is suboptimal. However, it is tedious for developers to control the hardware themselves. In this paper we propose an extension of an existing I/O interface which on the one hand is easy to use and on the other hand could steer energy saving modes more efficiently. Furthermore, the proposed modifications are beneficial for performance analysis and provide even more information to the I/O library to improve performance. When a user annotates the program with the proposed interface, I/O, communication and computation phases are labeled by the developer. Run-time behavior is then characterized for each phase; this knowledge can then be exploited by the new library.},
}

@article{CECOSDKMKL10,
	author	 = {Julian Kunkel and Olga Mordvinova and Michael Kuhn and Thomas Ludwig},
	title	 = {{Collecting Energy Consumption of Scientific Data}},
	year	 = {2010},
	editor	 = {Thomas Ludwig},
	publisher	 = {Springer},
	address	 = {Berlin / Heidelberg, Germany},
	journal	 = {Computer Science - Research and Development},
	series	 = {3},
	pages	 = {1--9},
	issn	 = {1865-2034},
	doi	 = {https://doi.org/10.1007/s00450-010-0121-5},
	abstract	 = {In this paper the data life cycle management is extended by accounting for energy consumption during the life cycle of files. Information about the energy consumption of data not only makes it possible to account for the correct costs of its life cycle, but also provides feedback to the user and administrator, and improves awareness of the energy consumption of file I/O. Ideas to realize a storage landscape which determines the energy consumption for maintaining and accessing each file are discussed. We propose to add new extended attributes to file metadata which make it possible to compute the energy consumed during the life cycle of each file.},
	url	 = {http://www.springerlink.com/content/k3w8277867372413/fulltext.pdf},
}

@article{SOPCOEECHM10,
	author	 = {Timo Minartz and Julian Kunkel and Thomas Ludwig},
	title	 = {{Simulation of power consumption of energy efficient cluster hardware}},
	year	 = {2010},
	editor	 = {Thomas Ludwig},
	publisher	 = {Springer},
	address	 = {Berlin / Heidelberg, Germany},
	journal	 = {Computer Science - Research and Development},
	series	 = {3},
	pages	 = {165--175},
	issn	 = {1865-2034},
	doi	 = {https://doi.org/10.1007/s00450-010-0120-6},
	abstract	 = {In recent years the power consumption of high-performance computing clusters has become a growing problem because the number and size of cluster installations has been rising. The high power consumption of clusters is a consequence of their design goal: high performance. With low utilization, cluster hardware consumes nearly as much energy as when it is fully utilized. Theoretically, in these low utilization phases cluster hardware can be turned off or switched to a lower power consuming state. We designed a model to estimate the power consumption of hardware based on the utilization. Applications are instrumented to create utilization trace files for a simulator realizing this model. Different hardware components can be simulated using multiple estimation strategies. An optimal strategy determines an upper bound of energy savings for existing hardware without affecting the time-to-solution. Additionally, the simulator can estimate the power consumption of efficient hardware which is energy-proportional. This way the minimum power consumption can be determined for a given application. Naturally, this minimal power consumption provides an upper bound for any power saving strategy. After evaluating the correctness of the simulator, several different strategies and energy-proportional hardware are compared.},
	url	 = {http://www.springerlink.com/content/r21r2376730p7161/fulltext.pdf},
}

@article{IPEWPPIBMR10,
	author	 = {Olga Mordvinova and Dennis Runz and Julian Kunkel and Thomas Ludwig},
	title	 = {{I/O Performance Evaluation with Parabench -- Programmable I/O Benchmark}},
	year	 = {2010},
	publisher	 = {Elsevier B.V},
	address	 = {Amsterdam, Netherlands},
	journal	 = {Procedia Computer Science},
	series	 = {1-1},
	pages	 = {2119--2128},
	issn	 = {1877-0509},
	doi	 = {https://doi.org/10.1016/j.procs.2010.04.238},
	abstract	 = {Choosing an appropriate cluster file system for a specific high performance computing application is challenging and depends mainly on the specific application I/O needs. There is a wide variety of I/O requirements: some implementations require reading and writing large datasets, others out-of-core data access, or they have database access requirements. Application access patterns reflect different I/O behavior and can be used for performance testing. This paper presents the programmable I/O benchmarking tool Parabench. It takes access patterns as input, which can be adapted to mimic the behavior of a rich set of applications. Using this benchmarking tool, composed patterns can be automatically tested and easily compared on different local and cluster file systems. Here we introduce the design of the proposed benchmark, focusing on the Parabench programming language, which was developed for flexible pattern creation. We also demonstrate an exemplary usage of Parabench and its capabilities to handle the POSIX and MPI-IO interfaces.},
	url	 = {http://www.sciencedirect.com/science/article/pii/S1877050910002395},
}

@article{FESTBARSPT10,
	author	 = {Kathleen Börner and Johannes Hermle and Christoph Sommer and Nigel P. Brown and Bettina Knapp and Bärbel Glass and Julian Kunkel and Gloria Torralba and Jürgen Reymann and Nina Beil and Jürgen Beneke and Rainer Pepperkok and Reinhard Schneider and Thomas Ludwig and Michael Hausmann and Fred Hamprecht and Holger Erfle and Lars Kaderali and Hans-Georg Kräusslich and Maik J. Lehmann},
	title	 = {{From experimental setup to bioinformatics: an RNAi screening platform to identify host factors involved in HIV-1 replication}},
	year	 = {2010},
	month	 = {01},
	publisher	 = {WILEY-VCH},
	address	 = {Weinheim, Germany},
	journal	 = {Biotechnology Journal},
	series	 = {5-1},
	pages	 = {39--49},
	issn	 = {1860-7314},
	doi	 = {https://doi.org/10.1002/biot.200900226},
	abstract	 = {RNA interference (RNAi) has emerged as a powerful technique for studying loss-of-function phenotypes by specific down-regulation of gene expression, allowing the investigation of virus-host interactions by large-scale high-throughput RNAi screens. Here we comprehensively describe a robust and sensitive siRNA screening platform consisting of an experimental setup, single-cell image analysis and statistical as well as bioinformatics analyses. The workflow has been established to elucidate host gene functions exploited by viruses, monitoring both suppression and enhancement of viral replication simultaneously by fluorescence microscopy. The platform comprises a two-stage procedure in which potential host factors were first identified in a primary screen and afterwards retested in a validation screen to confirm true positive hits. Subsequent bioinformatics analysis allows the identification of cellular genes participating in metabolic pathways and cellular networks utilized by viruses for efficient infection. Our workflow has been used to investigate host factor usage by the human immunodeficiency virus-1 (HIV-1) but can also be adapted to different viruses. Importantly, the provided platform can be used to guide further screening approaches, thus contributing to filling current gaps in our understanding of virus-host interactions.},
	url	 = {http://onlinelibrary.wiley.com/doi/10.1002/biot.200900226/pdf},
}

@article{DFSSTEMOIP09,
	author	 = {Michael Kuhn and Julian Kunkel and Thomas Ludwig},
	title	 = {{Dynamic file system semantics to enable metadata optimizations in PVFS}},
	year	 = {2009},
	publisher	 = {John Wiley and Sons Ltd.},
	address	 = {Chichester, UK},
	journal	 = {Concurrency and Computation: Practice and Experience},
	series	 = {21-14},
	pages	 = {1775--1788},
	issn	 = {1532-0626},
	doi	 = {https://doi.org/10.1002/cpe.1439},
	abstract	 = {Modern file systems maintain extensive metadata about stored files. While metadata typically is useful, there are situations when the additional overhead of such a design becomes a problem in terms of performance. This is especially true for parallel and cluster file systems, where every metadata operation is even more expensive due to their architecture. In this paper several changes made to the parallel cluster file system Parallel Virtual File System (PVFS) are presented. The changes target the optimization of workloads with large numbers of small files. To improve the metadata performance, PVFS was modified such that unnecessary metadata is not managed anymore. Several tests with a large quantity of files were performed to measure the benefits of these changes. The tests have shown that common file system operations can be sped up by a factor of two even with relatively few changes.},
	url	 = {http://onlinelibrary.wiley.com/doi/10.1002/cpe.1439/pdf},
}

@inproceedings{ITOOTRPTRT20,
	author	 = {Frank Gadban and Julian Kunkel and Thomas Ludwig},
	title	 = {{Investigating the Overhead of the REST Protocol to Reveal the Potential for Using Cloud Services for HPC Storage}},
	year	 = {2020},
	month	 = {06},
	booktitle	 = {{High Performance Computing: ISC High Performance 2020 International Workshops, Revised Selected Papers}},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	conference	 = {ISC HPC},
	location	 = {Frankfurt, Germany},
	isbn	 = {to appear},
	abstract	 = {With the significant advances in Cloud Computing, exploring the usage of Cloud technology in HPC workflows has become inevitable. While many Cloud vendors offer to move complete HPC workloads into the Cloud, this is limited by the massive demand for computing power alongside the storage resources typically required by I/O-intensive HPC applications. It is widely believed that HPC hardware and software protocols like MPI yield superior performance and lower resource consumption compared to the HTTP transfer protocol used by RESTful Web Services that are prominent in Cloud execution and Cloud storage. With the advent of enhanced versions of HTTP, it is time to reevaluate the effective usage of cloud-based storage in HPC and its ability to cope with various types of data-intensive workloads. In this paper, we investigate the overhead of the REST protocol via HTTP compared to the HPC-native communication protocol MPI when storing and retrieving objects. Although we compare MPI for a communication use case, we can still evaluate the impact of data communication and, therewith, the efficiency of data transfer for data access patterns. We accomplish this by modeling the impact of data transfer using measurable performance metrics. Hence, our contribution is the creation of a performance model based on hardware counters that provides an analytical representation of data transfer over current and future protocols. We validate this model by comparing the results obtained for REST and MPI on two different cluster systems, one equipped with Infiniband and one with Gigabit Ethernet. The evaluation shows that REST can be a viable, performant, and resource-efficient solution, in particular for accessing large files.},
}

@inproceedings{TIOTBWCJIP20,
	author	 = {Eugen Betke and Julian Kunkel},
	title	 = {{The Importance of Temporal Behavior when Classifying Job IO Patterns Using Machine Learning Techniques}},
	year	 = {2020},
	month	 = {06},
	booktitle	 = {{High Performance Computing: ISC High Performance 2020 International Workshops, Revised Selected Papers}},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	conference	 = {ISC HPC},
	location	 = {Frankfurt, Germany},
	isbn	 = {to appear},
	abstract	 = {Every day, supercomputers execute thousands of jobs with different characteristics. Data centers monitor the behavior of jobs to support the users and improve the infrastructure, for instance, by optimizing jobs or by determining guidelines for the next procurement. The classification of jobs into groups that express similar run-time behavior aids this analysis as it reduces the number of representative jobs to look into. It is state of the practice to investigate job similarity by looking into job profiles that summarize the dynamics of job execution into one dimension of statistics and neglect the temporal behavior. In this work, we utilize machine learning techniques to cluster and classify parallel jobs based on the similarity in their temporal IO behavior to highlight the importance of temporal behavior when comparing jobs. Our contribution is the qualitative and quantitative evaluation of different IO characterizations and similarity measurements that work toward the development of a suitable clustering algorithm. We explore IO characteristics from monitoring data of one million parallel jobs and cluster them into groups of similar jobs. To this end, the time series of various IO statistics is converted into features using different similarity metrics that customize the classification. We discuss conventional ML techniques that are applied to job profiles and contrast this with the analysis of time series data where we apply the Levenshtein distance as a distance metric. While the employed Levenshtein algorithms are not yet optimal, the results suggest that temporal behavior is key to identifying related patterns.},
}

@inproceedings{SAOIBBITIC20,
	author	 = {Eugen Betke and Julian Kunkel},
	title	 = {{Semi-automatic Assessment of I/O Behavior by Inspecting the Individual Client-Node Timelines -- An Explorative Study on $10^6$ Jobs}},
	year	 = {2020},
	month	 = {06},
	booktitle	 = {{High Performance Computing: ISC High Performance 2020}},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	conference	 = {ISC HPC},
	location	 = {Frankfurt, Germany},
	abstract	 = {HPC applications with suboptimal I/O behavior interfere with well-behaving applications and lead to increased application runtime. In some cases, this may even lead to unresponsive systems and unfinished jobs. HPC monitoring systems can aid users and support staff in identifying problematic behavior and support the optimization of problematic applications. The key issue is how to identify relevant applications. A profile of an application does not allow identifying problematic phases during the execution, but tracing each individual I/O operation is too invasive. In this work, we split the execution into segments, i.e., windows of fixed size, and analyze their profiles. We develop three I/O metrics to identify three relevant classes of inefficient I/O behaviors, and evaluate them on raw data of 1,000,000 jobs on the supercomputer Mistral. The advantage of our method is that temporal information about I/O activities during job runtime is preserved to some extent and can be used to identify phases of inefficient I/O. The main contribution of this work is the segmentation of time series and the computation of metrics (Job-I/O-Utilization, Job-I/O-Problem-Time, and Job-I/O-Balance) that are effective in identifying problematic I/O phases and jobs.},
}

@inproceedings{WCDMNARTTM19,
	author	 = {Max Lübbering and Julian Kunkel and Patricio Farrell},
	title	 = {{What Company Does My News Article Refer to? Tackling Multi Class Problems With Topic Modeling}},
	year	 = {2019},
	month	 = {09},
	booktitle	 = {{Proceedings of the Conference on "Lernen, Wissen, Daten, Analysen", Berlin, Germany, September 30 -- October 2, 2019}},
	editor	 = {Robert Jäschke and Matthias Weidlich},
	publisher	 = {CEUR-WS.org},
	series	 = {CEUR Workshop Proceedings},
	number	 = {2454},
	pages	 = {353--364},
	conference	 = {LWDA 2019},
	location	 = {Berlin, Germany},
	abstract	 = {While it is technically trivial to search for the company name to predict the company a news article refers to, this often leads to wrong results. In this article, we compare two approaches, bag-of-words with k-nearest neighbors and Latent Dirichlet Allocation with k-nearest neighbors, by assessing their applicability for predicting the S\&P 500 company which is mentioned in a business news article or press release. Both approaches are evaluated on a corpus of 13k documents containing 84\% news articles and 16\% press releases. While the bag-of-words approach yields accurate predictions, it is highly inefficient due to its gigantic feature space. The Latent Dirichlet Allocation approach, on the other hand, manages to achieve roughly the same prediction accuracy (0.58 instead of 0.62) but reduces the feature space by a factor of seven.},
	url	 = {https://pages.cms.hu-berlin.de/ipa/lwda2019/},
}

@inproceedings{OMBEWUKMJK20,
	author	 = {Nabeeh Jumah and Julian Kunkel},
	title	 = {{Optimizing Memory Bandwidth Efficiency with User-Preferred Kernel Merge}},
	year	 = {2020},
	month	 = {05},
	booktitle	 = {{Euro-Par 2019: Parallel Processing Workshops}},
	editor	 = {Ulrich Schwardmann and Christian Boehme and Dora B. Heras and Valeria Cardellini and Emmanuel Jeannot and Antonio Salis and Claudio Schifanella and Ravi Reddy Manumachu and Dieter Schwamborn and Laura Ricci and Oh Sangyoon and Thomas Gruber and Laura Antonelli and Stephen L. Scott},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11997},
	pages	 = {69--81},
	conference	 = {COLOC - Workshop on Data Locality},
	location	 = {Göttingen, Germany},
	isbn	 = {978-3-030-48340-1},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-48340-1_6},
	abstract	 = {Earth system modeling computations use stencils extensively while running many kernels. Optimal coding of the stencils is essential to efficiently use the memory bandwidth of the underlying hardware. This is important as stencil computations are memory bound. Even when the code within one kernel is written to optimally use the memory bandwidth, there can still be opportunities for further optimization at the inter-kernel level. Stencils naturally exhibit data locality, and executing a sequence of stencils within separate kernels could waste caching capabilities. Merging the kernels allows improving the use of the caches. Some tools were developed to automatically fuse loops instead of requiring manual optimization. However, scientists still apply fusion at different levels of loop nests manually to find optimal performance. To allow scientists to apply loop fusions equal to manual loop fusion, we develop a technique to automatically analyze the code and allow scientists to apply their preferred fusions without the effort of dependency analysis and code transformation. Our work is done using the GGDML language extensions, which enable performance portability over different architectures using a single source code.},
}

@inproceedings{SPOSUMJK19,
	author	 = {Nabeeh Jumah and Julian Kunkel},
	title	 = {{Scalable Parallelization of Stencils using MODA}},
	year	 = {2019},
	month	 = {12},
	booktitle	 = {{High Performance Computing: ISC High Performance 2019 International Workshops, Frankfurt/Main, Germany, June 16-20, 2019, Revised Selected Papers}},
	editor	 = {Michèle Weiland and Guido Juckeland and Sadaf Alam and Heike Jagode},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11887},
	pages	 = {142--154},
	conference	 = {P^3MA workshop, ISC HPC},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-34356-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-34356-9_13},
	abstract	 = {The natural and design limitations of the evolution of processors, e.g., frequency scaling and memory bandwidth bottlenecks, push towards scaling applications on multiple-node configurations besides exploiting the power of each single node. This introduced new challenges to porting applications to the new infrastructure, especially with heterogeneous environments. Domain decomposition and handling the resulting necessary communication is not a trivial task. In general, tools cannot decide how to parallelize code automatically as a result of the semantics of general-purpose languages. To allow scientists to avoid such problems, we introduce the Memory-Oblivious Data Access (MODA) technique, and use it to scale code to configurations ranging from a single node to multiple nodes, supporting different architectures, without requiring changes in the source code of the application. We present a technique to automatically identify necessary communication based on higher-level semantics. The extracted information enables tools to generate code that handles the communication. A prototype is developed to implement the techniques and used to evaluate the approach. The results show the effectiveness of using the techniques to scale code on multi-core processors and on GPU-based machines. Comparing the ratios of the achieved GFLOPS to the number of nodes in each run, and repeating that on different numbers of nodes, shows that the achieved scaling efficiency is around 100\%. This was repeated with up to 100 nodes. An exception to this is the single-node configuration using a GPU, in which no communication is needed, and hence, no data movement between GPU and host memory is needed, which yields higher GFLOPS.},
	url	 = {https://link.springer.com/chapter/10.1007/978-3-030-34356-9_13},
}

@inproceedings{FPIMLTCAIB19,
	author	 = {Eugen Betke and Julian Kunkel},
	title	 = {{Footprinting Parallel I/O -- Machine Learning to Classify Application's I/O Behavior}},
	year	 = {2019},
	month	 = {12},
	booktitle	 = {{High Performance Computing: ISC High Performance 2019 International Workshops, Frankfurt/Main, Germany, June 16-20, 2019, Revised Selected Papers}},
	editor	 = {Michèle Weiland and Guido Juckeland and Sadaf Alam and Heike Jagode},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11887},
	pages	 = {214--226},
	conference	 = {HPC IODC workshop, ISC HPC},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-34356-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-34356-9_18},
	abstract	 = {It is not uncommon to run tens of thousands of parallel jobs on large HPC systems. The amount of data collected by monitoring systems on such systems is immense. Checking each job individually by hand, e.g., for identification of high workload or anomaly detection, is hardly feasible. Therefore, we are looking for an automated approach that can do this task. Many automated approaches look at job statistics over the entire job runtime; information about different activities during the job execution is lost. In our work, for each job, we reduce the collected monitoring data to a sequence of I/O behavior. Then, we convert the sequence to a footprint vector, where each element shows how often this behavior occurs. After that, the footprint dataset is classified to identify applications with similar I/O behavior. Human-understandable class interpretation is the only non-automatic step in the workflow. The contribution of this paper is a data reduction technique for monitoring data and an automated job classification method.},
	url	 = {https://link.springer.com/chapter/10.1007/978-3-030-34356-9_18},
}

@inproceedings{TUISVPKB19,
	author	 = {Julian Kunkel and Eugen Betke},
	title	 = {{Tracking User-Perceived I/O Slowdown via Probing}},
	year	 = {2019},
	month	 = {12},
	booktitle	 = {{High Performance Computing: ISC High Performance 2019 International Workshops, Frankfurt/Main, Germany, June 16-20, 2019, Revised Selected Papers}},
	editor	 = {Michèle Weiland and Guido Juckeland and Sadaf Alam and Heike Jagode},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11887},
	pages	 = {169--182},
	conference	 = {HPC-IODC workshop, ISC HPC},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-34356-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-34356-9_15},
	abstract	 = {The perceived I/O performance of a shared file system heavily depends on the usage pattern expressed by all concurrent jobs. From the perspective of a single user or job, the achieved I/O throughput can vary significantly due to activities conducted by other users or system services like RAID rebuilds. As these activities are hidden, users wonder about the cause of observed slowdown and may contact the service desk to report an unusually slow system. In this paper, we present a methodology to investigate and quantify the user-perceived slowdown which sheds light on the perceivable file system performance. This is achieved by deploying a monitoring system on a client node that constantly probes the performance of various data and metadata operations and then computes a slowdown factor. This information can be acquired and visualized in a timely fashion, informing the users about the expected slowdown. To evaluate the method, we deploy the monitoring on three data centers and explore the gathered data for up to a period of 60 days. A verification of the method is conducted by investigating the metrics while running the IO-500 benchmark. We conclude that this approach is able to reveal short-term and long-term interference.},
	url	 = {https://link.springer.com/chapter/10.1007/978-3-030-34356-9_15},
}

@inproceedings{AOPIUOTUNS19,
	author	 = {Andrew Turner and Dominic Sloan-Murphy and Karthee Sivalingam and Harvey Richardson and Julian Kunkel},
	title	 = {{Analysis of parallel I/O use on the UK national supercomputing service, ARCHER using Cray's LASSi and EPCC SAFE}},
	year	 = {2019},
	month	 = {10},
	conference	 = {CUG},
	location	 = {Montreal, Canada},
	abstract	 = {In this paper, we describe how we have used a combination of the LASSi tool (developed by Cray) and the SAFE software (developed by EPCC) to collect and analyse Lustre I/O performance data for all jobs running on the UK national supercomputing service, ARCHER, and to provide reports on I/O usage for users in our standard reporting framework. We also present results from analysis of parallel I/O use on ARCHER and analysis of the potential impact of different applications on file system performance using metrics we have derived from the LASSi data. We show that the performance data from LASSi reveals how the same application can stress different components of the file system depending on how it is run, and how the LASSi risk metrics allow us to identify use cases that could potentially cause issues for global I/O performance and work with users to improve their I/O use. We use the IO-500 benchmark to help us understand how LASSi risk metrics correspond to observed performance on the ARCHER file systems. We also use LASSi data imported into SAFE to identify I/O use patterns associated with different research areas, understand how the research workflow gives rise to the observed patterns and project how this will affect I/O requirements in the future. Finally, we provide an overview of likely future directions for the continuation of this work.},
	url	 = {https://cug.org/proceedings/cug2019_proceedings/includes/files/pap118s2-file1.pdf},
}

@inproceedings{AVOSCWTGLE19,
	author	 = {Nabeeh Jum'ah and Julian Kunkel},
	title	 = {{Automatic Vectorization of Stencil Codes with the GGDML Language Extensions}},
	year	 = {2019},
	booktitle	 = {{Workshop on Programming Models for SIMD/Vector Processing (WPMVP'19), February 16, 2019, Washington, DC, USA}},
	publisher	 = {ACM},
	address	 = {New York, NY, USA},
	series	 = {WPMVP},
	pages	 = {1--7},
	conference	 = {WPMVP-2019},
	organization	 = {PPoPP 2019},
	location	 = {Washington DC, USA},
	isbn	 = {978-1-4503-6291-7},
	doi	 = {https://doi.org/10.1145/3303117.3306160},
	abstract	 = {Partial differential equation (PDE) solvers are important for many applications. PDE solvers execute kernels which apply stencil operations over 2D and 3D grids. As PDE solvers and stencil codes are widely used in performance critical applications, they must be well optimized. Stencil computations naturally depend on neighboring grid elements. Therefore, data locality must be exploited to optimize the code and to better use the memory bandwidth -- at the same time, vector processing capabilities of the processor must be utilized. In this work, we investigate the effectiveness of using high-level language extensions to exploit SIMD and vectorization features of multi-core processors and vector engines. We write a prototype application using the GGDML high-level language extensions, and translate the high-level code with different configurations to investigate the efficiency of the language extensions and the source-to-source translation process to exploit the vector units of the multi-core processors and the vector engines. The conducted experiments demonstrate the effectiveness of the language extensions and the translation tool to generate vectorized codes, which make use of the natural data locality of stencil computations.},
	url	 = {https://ppopp19.sigplan.org/home/WPMVP-2019},
}

@inproceedings{TUIBIHWLSC19,
	author	 = {Jakob Lüttgau and Shane Snyder and Philip Carns and Justin M. Wozniak and Julian Kunkel and Thomas Ludwig},
	title	 = {{Toward Understanding I/O Behavior in HPC Workflows}},
	year	 = {2019},
	month	 = {02},
	booktitle	 = {{IEEE/ACM 3rd International Workshop on Parallel Data Storage \& Data Intensive Scalable Computing Systems (PDSW-DISCS)}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {64--75},
	conference	 = {PDSW-DISCS},
	location	 = {Dallas, Texas},
	isbn	 = {978-1-7281-0192-7},
	doi	 = {https://doi.org/10.1109/PDSW-DISCS.2018.00012},
	abstract	 = {Scientific discovery increasingly depends on complex workflows consisting of multiple phases and sometimes millions of parallelizable tasks or pipelines. These workflows access storage resources for a variety of purposes, including preprocessing, simulation output, and postprocessing steps. Unfortunately, most workflow models focus on the scheduling and allocation of computational resources for tasks while the impact on storage systems remains a secondary objective and an open research question. I/O performance is not usually accounted for in workflow telemetry reported to users. In this paper, we present an approach to augment the I/O efficiency of the individual tasks of workflows by combining workflow description frameworks with system I/O telemetry data. A conceptual architecture and a prototype implementation for HPC data center deployments are introduced. We also identify and discuss challenges that will need to be addressed by workflow management and monitoring systems for HPC in the future. We demonstrate how real-world applications and workflows could benefit from the approach, and we show how the approach helps communicate performance-tuning guidance to users.},
}

@inproceedings{BDBIWACSLK19,
	author	 = {Bryan N. Lawrence and Julian Kunkel and Jonathan Churchill and Neil Massey and Philip Kershaw and Matt Pritchard},
	title	 = {{Beating data bottlenecks in weather and climate science}},
	year	 = {2019},
	month	 = {01},
	booktitle	 = {{Extreme Data Workshop 2018}},
	editor	 = {Martin Schultz and Dirk Pleiter and Peter Bauer},
	publisher	 = {Forschungszentrum Jülich},
	series	 = {Schriften des Forschungszentrums Jülich IAS Series},
	number	 = {40},
	pages	 = {31--36},
	conference	 = {Extreme Data Workshop},
	location	 = {Jülich, Germany},
	isbn	 = {978-3-95806-392-1},
	issn	 = {1868-8489},
	abstract	 = {The data volumes produced by simulation and observation are large, and growing rapidly. In the case of simulation, plans for future modelling programmes require complicated orchestration of data, and anticipate large user communities. “Download and work at home” is no longer practical for many use-cases. In the case of simulation, these issues are exacerbated by users who want simulation data at grid point resolution instead of at the resolution resolved by the mathematics, and/or who design numerical experiments without knowledge of the storage costs. There is no simple solution to these problems: user education, smarter compression, and better use of tiered storage and smarter workflows are all necessary -- but far from sufficient. In this paper, we introduce two approaches to addressing some of these data bottlenecks: dedicated data analysis platforms, and smarter storage software. We provide a brief introduction to the JASMIN data storage and analysis facility, and some of the storage tools and approaches being developed by the ESiWACE project. In doing so, we describe some of our observations of real world data handling problems at scale, from the generic performance of file systems to the difficulty of optimising both volume stored and performance of workflows. We use these examples to motivate the two-pronged approach of smarter hardware and smarter software -- but recognise that data bottlenecks may yet limit the aspirations of our science.},
	url	 = {https://pdfs.semanticscholar.org/9881/ed9d9e16cb70fba9456fb0905bf28c450ce0.pdf#page=38},
}

@inproceedings{CAPMFESDMA19,
	author	 = {Jakob Lüttgau and Julian Kunkel},
	title	 = {{Cost and Performance Modeling for Earth System Data Management and Beyond}},
	year	 = {2019},
	month	 = {01},
	booktitle	 = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
	editor	 = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11203},
	pages	 = {23--35},
	conference	 = {HPC-IODC workshop, ISC HPC},
	organization	 = {ISC Team},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-02465-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-02465-9_2},
	abstract	 = {Current and anticipated storage environments confront domain scientists and data center operators with usability, performance and cost challenges. The amount of data upcoming systems will be required to handle is expected to grow exponentially, mainly due to increasing resolution and affordable compute power. Unfortunately, the relationship between cost and performance is not always well understood, requiring considerable effort for educated procurement. Within the Centre of Excellence in Simulation of Weather and Climate in Europe (ESiWACE), models to better understand cost and performance of current and future systems are being explored. This paper presents models and methodology focusing on, but not limited to, data centers used in the context of climate and numerical weather prediction. The paper concludes with a case study of alternative deployment strategies and outlines the challenges anticipating their impact on cost and performance. By publishing these early results, we would like to make the case to work towards standard models and methodologies collaboratively as a community to create sufficient incentives for vendors to provide specifications in formats which are compatible with these modeling tools. In addition to that, we see application for such formalized models and information in I/O-related middleware, which is expected to make automated but reasonable decisions in increasingly heterogeneous data centers.},
}

@inproceedings{COCASTUSKT18,
	author	 = {Raul Torres and Julian Kunkel and Manuel F. Dolz and Thomas Ludwig},
	title	 = {{Comparison of Clang Abstract Syntax Trees using String Kernels}},
	year	 = {2018},
	month	 = {11},
	booktitle	 = {{2018 International Conference on High Performance Computing \& Simulation (HPCS)}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {106--113},
	conference	 = {HPCS 2018},
	location	 = {Orleans, France},
	isbn	 = {978-1-5386-7879-4},
	doi	 = {https://doi.org/10.1109/HPCS.2018.00032},
	abstract	 = {Abstract Syntax Trees (ASTs) are intermediate representations widely used by compiler frameworks. One of their strengths is that they can be used to determine the similarity among a collection of programs. In this paper we propose a novel comparison method that converts ASTs into weighted strings in order to get similarity matrices and quantify the level of correlation among codes. To evaluate the approach, we leveraged the corresponding strings derived from the Clang ASTs of a set of 100 source code examples written in C. Our kernel and two other string kernels from the literature were used to obtain similarity matrices among those examples. Next, we used Hierarchical Clustering to visualize the results. Our solution was able to identify different clusters formed by examples that shared similar semantics. We demonstrated that the proposed strategy can be promisingly applied to similarity problems involving trees or strings.},
}

@inproceedings{PPOESMWUGC19,
	author	 = {Nabeeh Jum'ah and Julian Kunkel},
	title	 = {{Performance Portability of Earth System Models with User-Controlled GGDML code Translation}},
	year	 = {2019},
	month	 = {01},
	booktitle	 = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
	editor	 = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11203},
	pages	 = {693--710},
	conference	 = {P3MA workshop, ISC HPC},
	organization	 = {ISC Team},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-02465-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-02465-9_50},
	abstract	 = {The increasing need for performance of earth system modeling and other scientific domains pushes the computing technologies in diverse architectural directions. The development of models needs technical expertise and the skills of using tools that are able to exploit the hardware capabilities. The heterogeneity of architectures complicates the development and the maintainability of the models. To improve the software development process of earth system models, we provide an approach that simplifies code maintainability by fostering separation of concerns while providing performance portability. We propose the use of high-level language extensions that reflect scientific concepts. Scientists can use the programming language of their choice to develop models and can optionally use the language extensions wherever needed. The code translation is driven by configurations that are separated from the model source code. These configurations are prepared by scientific programmers to optimally use the machine’s features. The main contribution of this paper is the demonstration of a user-controlled source-to-source translation technique for earth system models that are written with higher-level semantics. We discuss a flexible code translation technique that is driven by the users through a configuration input that is prepared especially to transform the code, and we use this technique to produce OpenMP- or OpenACC-enabled codes besides MPI to support multi-node configurations.},
}

@inproceedings{UMLWMKM19,
	author	 = {Julian Kunkel and George S. Markomanolis},
	title	 = {{Understanding Metadata Latency with MDWorkbench}},
	year	 = {2019},
	month	 = {01},
	booktitle	 = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
	editor	 = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11203},
	pages	 = {75--88},
	conference	 = {WOPSSS workshop, ISC HPC},
	organization	 = {ISC Team},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-02465-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-02465-9_5},
	abstract	 = {While parallel file systems often satisfy the need of applications with bulk synchronous I/O, they lack capabilities for dealing with metadata-intensive workloads. Typically, in procurements, the focus lies on the aggregated metadata throughput using the MDTest benchmark. However, metadata performance is crucial for interactive use. Metadata benchmarks involve even more parameters compared to I/O benchmarks. There are several aspects that are currently uncovered and, therefore, not in the focus of vendors to investigate. Particularly, response latency and interactive workloads operating on a working set of data. The lack of capabilities of file systems can be observed when looking at the IO-500 list, where metadata performance between the best and worst system does not differ significantly. In this paper, we introduce a new benchmark called MDWorkbench which generates a reproducible workload emulating many concurrent users or -- in an alternative view -- queuing systems. This benchmark provides a detailed latency profile, overcomes caching issues, and provides a method to assess the quality of the observed throughput. We evaluate the benchmark on state-of-the-art parallel file systems with GPFS (IBM Spectrum Scale), Lustre, Cray’s Datawarp, and DDN IME, and conclude that we can reveal characteristics that could not be identified before.},
}

@inproceedings{BODIAIFSFI19,
	author	 = {Eugen Betke and Julian Kunkel},
	title	 = {{Benefit of DDN's IME-Fuse and IME-Lustre File Systems for I/O Intensive HPC Applications}},
	year	 = {2019},
	month	 = {01},
	booktitle	 = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
	editor	 = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11203},
	pages	 = {131--144},
	conference	 = {WOPSSS workshop, ISC HPC},
	organization	 = {ISC Team},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-02465-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-02465-9_9},
	abstract	 = {Many scientific applications are limited by the I/O performance offered by parallel file systems on conventional storage systems. Flash-based burst buffers provide significantly better performance than HDD-backed storage, but at the expense of capacity. Burst buffers are considered the next step towards achieving wire-speed of the interconnect and providing more predictable low-latency I/O, which are the holy grail of storage. A critical evaluation of storage technology is mandatory as there is no long-term experience with performance behavior for particular application scenarios. The evaluation enables data centers to choose the right products and system architects to integrate them into HPC architectures. This paper investigates the native performance of DDN IME, a flash-based burst buffer solution. Then, it takes a closer look at the IME-FUSE file systems, which use IMEs as burst buffers and a Lustre file system as back-end. Finally, by utilizing a NetCDF benchmark, it estimates the performance benefit for climate applications.},
}

@inproceedings{TFAPIKBBCF19,
	author	 = {Julian Kunkel and Eugen Betke and Matt Bryson and Philip Carns and Rosemary Francis and Wolfgang Frings and Roland Laifer and Sandra Mendez},
	title	 = {{Tools for Analyzing Parallel I/O}},
	year	 = {2019},
	month	 = {01},
	booktitle	 = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
	editor	 = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {11203},
	pages	 = {49--70},
	conference	 = {HPC-IODC workshop, ISC HPC},
	organization	 = {ISC Team},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-030-02465-9},
	issn	 = {1611-3349},
	doi	 = {https://doi.org/10.1007/978-3-030-02465-9_4},
	abstract	 = {Parallel application I/O performance often does not meet user expectations. Additionally, slight access pattern modifications may lead to significant changes in performance due to complex interactions between hardware and software. These issues call for sophisticated tools to capture, analyze, understand, and tune application I/O. In this paper, we highlight advances in monitoring tools to help address these issues. We also describe best practices, identify issues in measurement and analysis, and provide practical approaches to translate parallel I/O analysis into actionable outcomes for users, facility operators, and researchers.},
}

@inproceedings{TGSDCTHIIA19,
	author	 = {Yevhen Alforov and Anastasiia Novikova and Michael Kuhn and Julian Kunkel and Thomas Ludwig},
	title	 = {{Towards Green Scientific Data Compression Through High-Level I/O Interfaces}},
	year	 = {2019},
	month	 = {02},
	booktitle	 = {{30th International Symposium on Computer Architecture and High Performance Computing}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {209--216},
	conference	 = {SBAC-PAD 2018},
	location	 = {Lyon, France},
	isbn	 = {978-1-5386-7769-8},
	issn	 = {1550-6533},
	doi	 = {https://doi.org/10.1109/CAHPC.2018.8645921},
	abstract	 = {Every HPC system today has to cope with a deluge of data generated by scientific applications, simulations or large-scale experiments. The upscaling of supercomputer systems and infrastructures generally results in a dramatic increase of their energy consumption. In this paper, we argue that techniques like data compression can lead to significant gains in terms of power efficiency by reducing both network and storage requirements. To that end, we propose a novel methodology for achieving on-the-fly intelligent determination of energy-efficient data reduction for a given data set by leveraging state-of-the-art compression algorithms and metadata at application-level I/O. We motivate our work by analyzing the energy and storage saving needs of real-life scientific HPC applications, and review the various compression techniques that can be applied. We find that the resulting data reduction can decrease the data volume transferred and stored by as much as 80\% in some cases, consequently leading to significant savings in storage and networking costs.},
}

@inproceedings{SOHSSFTAQL17,
	author	 = {Jakob Lüttgau and Julian Kunkel},
	title	 = {{Simulation of Hierarchical Storage Systems for TCO and QoS}},
	year	 = {2017},
	booktitle	 = {{High Performance Computing: ISC High Performance 2017 International Workshops, DRBSD, ExaComm, HCPM, HPC-IODC, IWOPH, IXPUG, P^3MA, VHPC, Visualization at Scale, WOPSSS}},
	editor	 = {Julian Kunkel and Rio Yokota and Michaela Taufer and John Shalf},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {10524},
	pages	 = {116--128},
	conference	 = {ISC High Performance},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-319-67629-6},
	doi	 = {https://doi.org/10.1007/978-3-319-67630-2_12},
	abstract	 = {Due to the variety of storage technologies, deep storage hierarchies turn out to be the most feasible choice to meet performance and cost requirements when handling vast amounts of data. Long-term archives employed by scientific users are mainly reliant on tape storage, as it remains the most cost-efficient option. Archival systems are often loosely integrated into the HPC storage infrastructure. In expectation of exascale systems and in situ analysis, burst buffers will also require integration with the archive. Exploring new strategies and developing open software for tape systems is a hurdle due to the lack of affordable storage silos and availability outside of large organizations, and due to increased wariness requirements when dealing with ultra-durable data. Lessening these problems by providing virtual storage silos should enable community-driven innovation and enable site operators to add features where they see fit, while being able to verify strategies before deploying them on production systems. Different models for the individual components in tape systems are developed. The models are then implemented in a prototype simulation using discrete event simulation. The work shows that the simulations can be used to approximate the behavior of tape systems deployed in the real world and to conduct experiments without requiring a physical tape system.},
}

@inproceedings{RIOHAWSEGA17,
	author	 = {Eugen Betke and Julian Kunkel},
	title	 = {{Real-Time I/O-Monitoring of HPC Applications with SIOX, Elasticsearch, Grafana and FUSE}},
	year	 = {2017},
	booktitle	 = {{High Performance Computing: ISC High Performance 2017 International Workshops, DRBSD, ExaComm, HCPM, HPC-IODC, IWOPH, IXPUG, P^3MA, VHPC, Visualization at Scale, WOPSSS}},
	editor	 = {Julian Kunkel and Rio Yokota and Michaela Taufer and John Shalf},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {10524},
	pages	 = {158--170},
	conference	 = {ISC High Performance},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-319-67629-6},
	doi	 = {https://doi.org/10.1007/978-3-319-67630-2},
	abstract	 = {The starting point for our work was a demand for an overview of applications' I/O behavior that provides information about the usage of our HPC system “Mistral”. We suspect that some applications are running with inefficient I/O patterns and, probably, are wasting a significant amount of machine hours. To tackle the problem, we focus on the detection of poor I/O performance, the identification of these applications, and the description of their I/O behavior. Instead of gathering I/O statistics from global system variables, like many other monitoring tools do, in our approach statistics come directly from the I/O interfaces POSIX, MPI, HDF5 and NetCDF. For the interception of I/O calls we use an instrumentation library that is dynamically linked with LD_PRELOAD at program startup. The HPC online monitoring framework is built on top of open source software: Grafana, SIOX, Elasticsearch and FUSE. This framework collects I/O statistics from applications and mount points. The latter is used for non-intrusive monitoring of virtual memory allocated with mmap(), i.e., no code adaptation is necessary. The framework is evaluated, showing its effectiveness, and critically discussed.},
}

@inproceedings{TDTSOCAFQC17,
	author	 = {Julian Kunkel and Anastasiia Novikova and Eugen Betke and Armin Schaare},
	title	 = {{Toward Decoupling the Selection of Compression Algorithms from Quality Constraints}},
	year	 = {2017},
	booktitle	 = {{High Performance Computing: ISC High Performance 2017 International Workshops, DRBSD, ExaComm, HCPM, HPC-IODC, IWOPH, IXPUG, P^3MA, VHPC, Visualization at Scale, WOPSSS}},
	editor	 = {Julian Kunkel and Rio Yokota and Michaela Taufer and John Shalf},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {10524},
	pages	 = {1--12},
	conference	 = {ISC High Performance},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-319-67629-6},
	doi	 = {https://doi.org/10.1007/978-3-319-67630-2_1},
	abstract	 = {Data-intensive scientific domains use data compression to reduce the storage space needed. Lossless data compression preserves the original information accurately but, on the domain of climate data, usually yields a compression factor of only 2:1. Lossy data compression can achieve much higher compression rates depending on the tolerable error/precision needed. Therefore, the field of lossy compression is still subject to active research. From the perspective of a scientist, the compression algorithm does not matter, but the qualitative information about the implied loss of precision of data is a concern. With the Scientific Compression Library (SCIL), we are developing a meta-compressor that allows users to set various quantities that define the acceptable error and the expected performance behavior. The ongoing work is a preliminary stage for the design of an automatic compression algorithm selector. The task of this missing key component is the construction of appropriate chains of algorithms to meet the users' requirements. This approach is a crucial step towards a scientifically safe use of much-needed lossy data compression, because it disentangles the task of determining the scientific ground characteristics of tolerable noise from the task of determining an optimal compression strategy given target noise levels and constraints. Future algorithms can be used without change in the application code, once they are integrated into SCIL. In this paper, we describe the user interfaces and quantities, two compression algorithms, and evaluate SCIL’s ability to compress climate data. This will show that the novel algorithms are competitive with the state-of-the-art compressors ZFP and SZ and illustrate that the best algorithm depends on user settings and data properties.},
}

@inproceedings{AMIDFNPMOT17,
	author	 = {Julian Kunkel and Eugen Betke},
	title	 = {{An MPI-IO In-Memory Driver for Non-Volatile Pooled Memory of the Kove XPD}},
	year	 = {2017},
	booktitle	 = {{High Performance Computing: ISC High Performance 2017 International Workshops, DRBSD, ExaComm, HCPM, HPC-IODC, IWOPH, IXPUG, P^3MA, VHPC, Visualization at Scale, WOPSSS}},
	editor	 = {Julian Kunkel and Rio Yokota and Michaela Taufer and John Shalf},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {10524},
	pages	 = {644--655},
	conference	 = {ISC High Performance},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-319-67629-6},
	doi	 = {https://doi.org/10.1007/978-3-319-67630-2_48},
	abstract	 = {Many scientific applications are limited by the performance offered by parallel file systems. SSD-based burst buffers provide significantly better performance than HDD-backed storage but at the expense of capacity. Clearly, achieving wire-speed of the interconnect and predictable low-latency I/O is the holy grail of storage. In-memory storage promises to provide optimal performance exceeding SSD-based solutions. Kove's XPD offers pooled memory for cluster systems. This remote memory is asynchronously backed up to storage devices of the XPDs and considered to be non-volatile. Although the system offers various APIs to access this memory, such as treating it as a block device, it does not allow exposing it as a file system that offers POSIX or MPI-IO semantics. In this paper, we 1) describe the XPD-MPIIO-driver which supports the scale-out architecture of the XPDs. This MPI-agnostic driver enables high-level libraries to utilize the XPD's memory as storage. 2) A thorough performance evaluation of the XPD is conducted. This includes scale-out testing of the infrastructure and metadata operations but also performance variability. We show that the driver and storage architecture is able to nearly saturate wire-speed of Infiniband (60+ GiB/s with 14 FDR links) while providing low latency and little performance variability.},
}

@inproceedings{ANSRAKFFTC17,
	author	 = {Raul Torres and Julian Kunkel and Manuel Dolz and Thomas Ludwig},
	title	 = {{A Novel String Representation and Kernel Function for the Comparison of I/O Access Patterns}},
	year	 = {2017},
	booktitle	 = {{International Conference on Parallel Computing Technologies}},
	editor	 = {Victor Malyshkin},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {10421},
	pages	 = {500--512},
	conference	 = {PaCT},
	location	 = {Nizhni Novgorod, Russia},
	isbn	 = {978-3-319-62932-2},
	doi	 = {https://doi.org/10.1007/978-3-319-62932-2_48},
	abstract	 = {Parallel I/O access patterns act as fingerprints of a parallel program. In order to extract meaningful information from these patterns, they have to be represented appropriately. Due to the fact that string objects can be easily compared using kernel methods, a conversion to a weighted string representation is proposed in this paper, together with a novel string kernel function called Kast Spectrum Kernel. The similarity matrices, obtained after applying the mentioned kernel over a set of examples from a real application, were analyzed using Kernel Principal Component Analysis (Kernel PCA) and Hierarchical Clustering. The evaluation showed that 2 out of 4 I/O access pattern groups were completely identified, while the other 2 formed a single cluster due to the intrinsic similarity of their members. The proposed strategy can be promisingly applied to other similarity problems involving tree-like structured data.},
}

@inproceedings{ADPUSSTIOS16,
	author	 = {Julian Kunkel},
	title	 = {{Analyzing Data Properties using Statistical Sampling Techniques -- Illustrated on Scientific File Formats and Compression Features}},
	year	 = {2016},
	month	 = {06},
	booktitle	 = {{High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P3MA, VHPC, WOPSSS}},
	editor	 = {Michela Taufer and Bernd Mohr and Julian Kunkel},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	number	 = {9945},
	pages	 = {130--141},
	conference	 = {ISC-HPC 2016},
	location	 = {Frankfurt, Germany},
	isbn	 = {978-3-319-46079-6},
	doi	 = {https://doi.org/10.1007/978-3-319-46079-6_10},
	abstract	 = {Understanding the characteristics of data stored in data centers helps computer scientists in identifying the most suitable storage infrastructure to deal with these workloads. For example, knowing the relevance of file formats allows optimizing the relevant formats but also helps in a procurement to define benchmarks that cover these formats. Existing studies that investigate performance improvements and techniques for data reduction such as deduplication and compression operate on a small set of data. Some of those studies claim the selected data is representative and scale their result to the scale of the data center. One hurdle of running novel schemes on the complete data is the vast amount of data stored and, thus, the resources required to analyze the complete data set. Even if this would be feasible, the costs for running many of those experiments must be justified. This paper investigates stochastic sampling methods to compute and analyze quantities of interest on file numbers but also on the occupied storage space. It will be demonstrated that on our production system, scanning 1 \% of files and data volume is sufficient to draw conclusions. This speeds up the analysis process and reduces costs of such studies significantly. The contributions of this paper are: (1) the systematic investigation of the inherent analysis error when operating only on a subset of data, (2) the demonstration of methods that help future studies to mitigate this error, (3) the illustration of the approach on a study for scientific file types and compression for a data center.},
}

@inproceedings{PPONIWMLKB15,
	author	 = {Julian Kunkel and Eugen Betke and Michaela Zimmer},
	title	 = {{Predicting Performance of Non-Contiguous I/O with Machine Learning}},
	year	 = {2015},
	booktitle	 = {{High Performance Computing, 30th International Conference, ISC High Performance 2015}},
	editor	 = {Julian Martin Kunkel and Thomas Ludwig},
	series	 = {Lecture Notes in Computer Science},
	number	 = {9137},
	pages	 = {257--273},
	conference	 = {ISC High Performance},
	location	 = {Frankfurt},
	issn	 = {0302-9743},
	doi	 = {https://doi.org/10.1007/978-3-319-20119-1_19},
}

@inproceedings{FILFRISLK14,
	author	 = {Jakob Lüttgau and Julian Kunkel},
	title	 = {{Feign: In-Silico Laboratory for Researching I/O Strategies}},
	year	 = {2014},
	booktitle	 = {{2014 9th Parallel Data Storage Workshop (PDSW)}},
	pages	 = {43--48},
	conference	 = {SC14},
	location	 = {New Orleans},
	url	 = {http://www.pdsw.org/pdsw14/papers/pdsw14_p43.pdf},
}

@inproceedings{PPONIWMLKB14,
	author	 = {Julian Kunkel and Eugen Betke and Michaela Zimmer},
	title	 = {{Predicting Performance of Non-Contiguous I/O with Machine Learning}},
	year	 = {2014},
	booktitle	 = {{Parallel Data Storage Workshop (PDSW), Work in Progress Session}},
	pages	 = {43--48},
	conference	 = {SC14},
	location	 = {New Orleans},
	url	 = {http://www.pdsw.org/pdsw14/wips/kunkel-wip-pdsw14.pdf},
}

@inproceedings{ACOTCMFMPA14,
	author	 = {Alvaro Aguilera and Holger Mickler and Julian Kunkel and Michaela Zimmer and Marc Wiedemann and Ralph Müller-Pfefferkorn},
	title	 = {{A Comparison of Trace Compression Methods for Massively Parallel Applications in Context of the SIOX Project}},
	year	 = {2014},
	booktitle	 = {{Tools for High Performance Computing 2013}},
	pages	 = {91--105},
	isbn	 = {978-3-319-08143-4},
}

@inproceedings{TSACAMAOOP14,
	author	 = {Julian Kunkel and Michaela Zimmer and Nathanael Hübbe and Alvaro Aguilera and Holger Mickler and Xuan Wang and Andrij Chut and Thomas Bönisch and Jakob Lüttgau and Roman Michel and Johann Weging},
	title	 = {{The SIOX Architecture -- Coupling Automatic Monitoring and Optimization of Parallel I/O}},
	year	 = {2014},
	booktitle	 = {{Supercomputing}},
	editor	 = {Julian Kunkel and Thomas Ludwig and Hans Meuer},
	publisher	 = {Springer},
	series	 = {Lecture Notes in Computer Science},
	pages	 = {245--260},
	conference	 = {ISC'14},
	organization	 = {ISC events},
	location	 = {Leipzig},
	isbn	 = {978-3-319-07517-4},
	doi	 = {https://doi.org/10.1007/978-3-319-07518-1_16},
	abstract	 = {Performance analysis and optimization of high-performance I/O systems is a daunting task. Mainly, this is due to the overwhelmingly complex interplay of the involved hardware and software layers. The Scalable I/O for Extreme Performance (SIOX) project provides a versatile environment for monitoring I/O activities and learning from this information. The goal of SIOX is to automatically suggest and apply performance optimizations, and to assist in locating and diagnosing performance problems. In this paper, we present the current status of SIOX. Our modular architecture covers instrumentation of POSIX, MPI and other high-level I/O libraries; the monitoring data is recorded asynchronously into a global database, and recorded traces can be visualized. Furthermore, we offer a set of primitive plug-ins with additional features to demonstrate the flexibility of our architecture: a surveyor plug-in to keep track of the observed spatial access patterns; an fadvise plug-in for injecting hints to achieve read-ahead for strided access patterns; and an optimizer plug-in which monitors the performance achieved with different MPI-IO hints, automatically supplying the best known hint-set when no hints were explicitly set. The presentation of the technical status is accompanied by a demonstration of some of these features on our 20-node cluster. In additional experiments, we analyze the overhead for concurrent access, for MPI-IO's 4 levels of access, and for an instrumented climate application. While our prototype is not yet full-featured, it demonstrates the potential and feasibility of our approach.},
}

@inproceedings{IDADLFCMTL13,
	author	 = {Raul Torres and Leonidas Lindarkis and Julian Kunkel and Thomas Ludwig},
	title	 = {{ICON DSL: A Domain-Specific Language for climate modeling}},
	year	 = {2013},
	month	 = {11},
	booktitle	 = {{WOLFHPC 2013 Third International Workshop on Domain-Specific Languages and High-Level Frameworks for High Performance Computing}},
	conference	 = {SC13},
	location	 = {Denver},
	url	 = {http://sc13.supercomputing.org/sites/default/files/WorkshopsArchive/pdfs/wp127s1.pdf},
}

@inproceedings{USTVPOMIK13,
	author	 = {Julian Kunkel},
	title	 = {{Using Simulation to Validate Performance of MPI(-IO) Implementations}},
	year	 = {2013},
	month	 = {06},
	booktitle	 = {{Supercomputing}},
	editor	 = {Julian Martin Kunkel and Thomas Ludwig and Hans Werner Meuer},
	publisher	 = {Springer},
	address	 = {Berlin, Heidelberg},
	series	 = {Lecture Notes in Computer Science},
	number	 = {7905},
	pages	 = {181--195},
	conference	 = {ISC 2013},
	location	 = {Leipzig, Germany},
	isbn	 = {978-3-642-38749-4},
	issn	 = {0302-9743},
	doi	 = {https://doi.org/10.1007/978-3-642-38750-0_14},
	abstract	 = {Parallel file systems and MPI implementations aim to exploit available hardware resources in order to achieve optimal performance. Since performance is influenced by many hardware and software factors, achieving optimal performance is a daunting task. For these reasons, optimized communication and I/O algorithms are still subject to research. While the complexity of collective MPI operations is sometimes discussed in the literature, a theoretical assessment of the measurements is de facto non-existent. Instead, conducted analysis is typically limited to performance comparisons to previous algorithms. However, observable performance is not only determined by the quality of an algorithm. At run-time, performance could be degraded due to unexpected implementation issues and triggered hardware and software exceptions. By applying a model that resembles the system, simulation allows us to estimate the performance. With this approach, the non-functional requirement for performance of an implementation can be validated and run-time inefficiencies can be localized. In this paper we demonstrate how simulation can be applied to assess the observed performance of collective MPI calls and parallel I/O. PIOsimHD, an event-driven simulator, is applied to validate observed performance on our 10-node cluster. The simulator replays recorded application activity and point-to-point operations of collective operations. It also offers the option to record trace files for visual comparison to recorded behavior. With this innovative introspection into behavior, several bottlenecks in the system and the implementation are localized.},
}

@inproceedings{ELCOCDHWKL13,
	author	 = {Nathanael Hübbe and Al Wegener and Julian Kunkel and Yi Ling and Thomas Ludwig},
	title	 = {{Evaluating Lossy Compression on Climate Data}},
	year	 = {2013},
	month	 = {06},
	booktitle	 = {{Supercomputing}},
	editor	 = {Julian Martin Kunkel and Thomas Ludwig and Hans Werner Meuer},
	publisher	 = {Springer},
	address	 = {Berlin, Heidelberg},
	series	 = {Lecture Notes in Computer Science},
	number	 = {7905},
	pages	 = {343--356},
	conference	 = {ISC 2013},
	location	 = {Leipzig, Germany},
	isbn	 = {978-3-642-38749-4},
	issn	 = {0302-9743},
	doi	 = {https://doi.org/10.1007/978-3-642-38750-0_26},
	abstract	 = {While the amount of data used by today’s high-performance computing (HPC) codes is huge, HPC users have not broadly adopted data compression techniques, apparently because of a fear that compression will either unacceptably degrade data quality or be too slow to be worth the effort. In this paper, we examine the effects of three lossy compression methods (GRIB2 encoding, GRIB2 using JPEG 2000 and LZMA, and the commercial Samplify APAX algorithm) on decompressed data quality, compression ratio, and processing time. A careful evaluation of selected lossy and lossless compression methods is conducted, assessing their influence on data quality, storage requirements and performance. The differences between input and decoded datasets are described and compared for the GRIB2 and APAX compression methods. Performance is measured using the compressed file sizes and the time spent on compression and decompression. The test data consists of both 9 synthetic datasets exposing compression behavior and 123 climate variables output from a climate model. The benefits of lossy compression for HPC systems are described and are related to our findings on data quality.},
}

@inproceedings{TSIHIZKL13,
	author	 = {Michaela Zimmer and Julian Kunkel and Thomas Ludwig},
	title	 = {{Towards Self-optimization in HPC I/O}},
	year	 = {2013},
	month	 = {06},
	booktitle	 = {{Supercomputing}},
	editor	 = {Julian Martin Kunkel and Thomas Ludwig and Hans Werner Meuer},
	publisher	 = {Springer},
	address	 = {Berlin, Heidelberg},
	series	 = {Lecture Notes in Computer Science},
	number	 = {7905},
	pages	 = {422--434},
	conference	 = {ISC 2013},
	location	 = {Leipzig, Germany},
	isbn	 = {978-3-642-38749-4},
	issn	 = {0302-9743},
	doi	 = {https://doi.org/10.1007/978-3-642-38750-0_32},
	abstract	 = {Performance analysis and optimization of high-performance I/O systems is a daunting task. Mainly, this is due to the overwhelmingly complex interplay of internal processes while executing application programs. Unfortunately, there is a lack of monitoring tools to reduce this complexity to a bearable level. For these reasons, the project Scalable I/O for Extreme Performance (SIOX) aims to provide a versatile environment for recording system activities and learning from this information. While still under development, SIOX will ultimately assist in locating and diagnosing performance problems and automatically suggest and apply performance optimizations. The SIOX knowledge path is concerned with the analysis and utilization of data describing the cause-and-effect chain recorded via the monitoring path. In this paper, we present our refined modular design of the knowledge path. This includes a description of logical components and their interfaces, details about extracting, storing and retrieving abstract activity patterns, a concept for tying knowledge to these patterns, and the integration of machine learning. Each of these tasks is illustrated through examples. The feasibility of our design is further demonstrated with an internal component for anomaly detection, permitting intelligent monitoring to limit the SIOX system’s impact on system resources.},
}

@inproceedings{ASODDIHSSM12,
	author	 = {Dirk Meister and Jürgen Kaiser and Andre Brinkmann and Michael Kuhn and Julian Kunkel and Toni Cortes},
	title	 = {{A Study on Data Deduplication in HPC Storage Systems}},
	year	 = {2012},
	month	 = {11},
	booktitle	 = {{Proceedings of the ACM/IEEE Conference on High Performance Computing (SC)}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	conference	 = {SC'12},
	location	 = {Salt Lake City, USA},
}

@inproceedings{SCPAEICMSK12,
	author	 = {Sandra Schröder and Michael Kuhn and Nathanael Hübbe and Julian Kunkel and Timo Minartz and Petra Nerge and Florens Wasserfall and Thomas Ludwig},
	title	 = {{Scientific Computing: Performance and Efficiency in Climate Models}},
	year	 = {2012},
	booktitle	 = {{Proceedings of the Work in Progress Session, 20th Euromicro International Conference on Parallel, Distributed and Network-Based Processing}},
	editor	 = {Erwin Grosspietsch and Konrad Klöckner},
	publisher	 = {Institute for Systems Engineering and Automation},
	address	 = {Johannes Kepler University Linz},
	series	 = {SEA-Publications},
	number	 = {31},
	conference	 = {PDP 2012},
	organization	 = {Munich Network Management Team},
	location	 = {Garching, Germany},
	isbn	 = {978-3-902457-31-8},
}

@inproceedings{SAASIWPKL12,
	author	 = {Julian Kunkel and Thomas Ludwig},
	title	 = {{Simulating Application and System Interaction with PIOsimHD}},
	year	 = {2012},
	booktitle	 = {{Proceedings of the Work in Progress Session, 20th Euromicro International Conference on Parallel, Distributed and Network-Based Processing}},
	editor	 = {Erwin Grosspietsch and Konrad Klöckner},
	publisher	 = {Institute for Systems Engineering and Automation},
	address	 = {Johannes Kepler University Linz},
	series	 = {SEA-Publications},
	number	 = {31},
	conference	 = {PDP 2012},
	organization	 = {Munich Network Management Team},
	location	 = {Garching, Germany},
	isbn	 = {978-3-902457-31-8},
}

@inproceedings{SPEOSIOKKL12,
	author	 = {Michael Kuhn and Julian Kunkel and Thomas Ludwig},
	title	 = {{Simulation-Aided Performance Evaluation of Server-Side Input/Output Optimizations}},
	year	 = {2012},
	booktitle	 = {{20th Euromicro International Conference on Parallel, Distributed and Network-Based Processing}},
	editor	 = {Rainer Stotzka and Michael Schiffers and Yiannis Cotronis},
	publisher	 = {IEEE Computer Society},
	address	 = {Los Alamitos, Washington, Tokyo},
	pages	 = {562--566},
	conference	 = {PDP 2012},
	organization	 = {Munich Network Management Team},
	location	 = {Garching, Germany},
	isbn	 = {978-0-7695-4633-9},
	issn	 = {1066-6192},
	abstract	 = {The performance of parallel distributed file systems suffers from many clients executing a large number of operations in parallel, because the I/O subsystem can be easily overwhelmed by the sheer amount of incoming I/O operations. Many optimizations exist that try to alleviate this problem. Client-side optimizations perform preprocessing to minimize the amount of work the file servers have to do. Server-side optimizations use server-internal knowledge to improve performance. The HDTrace framework contains components to simulate, trace and visualize applications. It is used as a testbed to evaluate optimizations that could later be implemented in real-life projects. This paper compares existing client-side optimizations and newly implemented server-side optimizations and evaluates their usefulness for I/O patterns commonly found in HPC. Server-directed I/O chooses the order of non-contiguous I/O operations and tries to aggregate as many operations as possible to decrease the load on the I/O subsystem and improve overall performance. The results show that server-side optimizations beat client-side optimizations in terms of performance for many use cases. Integrating such optimizations into parallel distributed file systems could alleviate the need for sophisticated client-side optimizations. Due to their additional knowledge of internal workflows, server-side optimizations may be better suited to provide high performance in general.},
}

@inproceedings{IMTIPWAFRO12,
	author	 = {Julian Kunkel and Thomas Ludwig},
	title	 = {{IOPm -- Modeling the I/O Path with a Functional Representation of Parallel File System and Hardware Architecture}},
	year	 = {2012},
	booktitle	 = {{20th Euromicro International Conference on Parallel, Distributed and Network-Based Processing}},
	editor	 = {Rainer Stotzka and Michael Schiffers and Yiannis Cotronis},
	publisher	 = {IEEE Computer Society},
	address	 = {Los Alamitos, Washington, Tokyo},
	pages	 = {554--561},
	conference	 = {PDP 2012},
	organization	 = {Munich Network Management Team},
	location	 = {Garching, Germany},
	isbn	 = {978-0-7695-4633-9},
	issn	 = {1066-6192},
	abstract	 = {The I/O path model (IOPm) is a graphical representation of the architecture of parallel file systems and the machine they are deployed on. With the help of IOPm, file system and machine configurations can be quickly analyzed and distinguished from each other. Contrary to typical representations of the machine and file system architecture, the model visualizes the data or metadata path of client access. Abstract functionality of hardware components such as client and server nodes is covered as well as software aspects such as high-level I/O libraries, collective I/O and caches. Redundancy could be represented, too. Besides the advantage of a standardized representation for analysis, IOPm assists in identifying and communicating bottlenecks in the machine and file system configuration by highlighting performance-relevant functionalities. By abstracting functionalities from the components they are hosted on, IOPm will enable building interfaces to monitor file system activity.},
}

@inproceedings{TPOMWPACSO10,
	author	 = {Yuichi Tsujita and Julian Kunkel and Stephan Krempel and Thomas Ludwig},
	title	 = {{Tracing Performance of MPI-I/O with PVFS2: A Case Study of Optimization}},
	year	 = {2010},
	booktitle	 = {{Parallel Computing: From Multicores and GPU's to Petascale}},
	publisher	 = {IOS Press},
	pages	 = {379--386},
	conference	 = {PARCO 2009},
	isbn	 = {978-1-60750-530-3},
	doi	 = {https://doi.org/10.3233/978-1-60750-530-3-379},
	abstract	 = {Parallel computing manages huge amounts of data due to a dramatic increase in computing scale. The parallel file system PVFS version 2 (PVFS2) realizes a scalable file system for such huge data on a cluster system. Although several MPI tracing tools can check the behavior of MPI functions, tracing PVFS server activities has not been available. Hence, we have missed chances to optimize MPI applications regarding PVFS server activities, although effective usage of limited resources is important even in PVFS servers. An off-line performance analysis tool named PIOviz traces both MPI-I/O calls and associated PVFS server activities to assist optimization of MPI applications. In addition, tracing statistical values of PVFS servers such as CPU usage and PVFS-internal statistics assists in optimizing MPI applications. In this paper, we demonstrate two performance evaluation tests of the HPIO benchmark and carry out off-line analysis by using PIOviz. The evaluation shows the effectiveness of PIOviz in detecting bottlenecks of MPI-I/O.},
	url	 = {http://ebooks.iospress.nl/volumearticle/26413},
}

@inproceedings{OFTCIKKTML12,
	author	 = {Michael Kuhn and Julian Kunkel and Yuichi Tsujita and Hidetaka Muguruma and Thomas Ludwig},
	title	 = {{Optimizations for Two-Phase Collective I/O}},
	year	 = {2012},
	booktitle	 = {{Applications, Tools and Techniques on the Road to Exascale Computing}},
	editor	 = {Koen De Bosschere and Erik H. D'Hollander and Gerhard R. Joubert and David Padua and Frans Peters},
	publisher	 = {IOS Press},
	address	 = {Amsterdam, Berlin, Tokyo, Washington DC},
	series	 = {Advances in Parallel Computing},
	number	 = {22},
	pages	 = {455--462},
	conference	 = {ParCo 2011},
	organization	 = {University of Ghent, ELIS Department},
	location	 = {Ghent, Belgium},
	isbn	 = {978-1-61499-040-6},
	issn	 = {0927-5452},
	abstract	 = {The performance of parallel distributed file systems suffers from many clients executing a large number of operations in parallel, because the I/O subsystem can be easily overwhelmed by the sheer amount of incoming I/O operations. This, in turn, can slow down the whole distributed system. Many optimizations exist that try to alleviate this problem. Client-side optimizations perform preprocessing to minimize the amount of work the file servers have to do. Server-side optimizations use server-internal knowledge to improve performance. This paper provides an overview of existing client-side optimizations and presents new modifications of the Two-Phase protocol. Interleaved Two-Phase is a modification of ROMIO's Two-Phase protocol, which iterates over the file differently to reduce the number of seek operations on disk. Pipelined Two-Phase uses a pipelined scheme which overlaps I/O and communication phases to utilize the network and I/O subsystems concurrently.},
}

@inproceedings{VOMDKL12,
	author	 = {Julian Kunkel and Thomas Ludwig},
	title	 = {{Visualization of MPI(-IO) Datatypes}},
	year	 = {2012},
	booktitle	 = {{Applications, Tools and Techniques on the Road to Exascale Computing}},
	editor	 = {Koen De Bosschere and Erik H. D'Hollander and Gerhard R. Joubert and David Padua and Frans Peters},
	publisher	 = {IOS Press},
	address	 = {Amsterdam, Berlin, Tokyo, Washington DC},
	series	 = {Advances in Parallel Computing},
	number	 = {22},
	pages	 = {473--480},
	conference	 = {ParCo 2011},
	organization	 = {University of Ghent, ELIS Department},
	location	 = {Ghent, Belgium},
	isbn	 = {978-1-61499-040-6},
	issn	 = {0927-5452},
	abstract	 = {To permit easy and efficient access to non-contiguous regions in memory for communication and I/O, the Message Passing Interface offers nested datatypes. Since nested datatypes can be very complicated, understanding non-contiguous access patterns and debugging wrongly accessed memory regions is hard for the developer. HDTrace is an environment which allows tracing the behavior of MPI programs and simulating them for arbitrary virtual cluster configurations. It is designed to record all MPI parameters, including MPI datatypes. In this paper we present the capabilities to visualize the usage of derived datatypes for communication and I/O accesses -- a simple hierarchical view is introduced which presents them in a compact form and allows digging into the nested datatypes. File regions accessed in non-contiguous I/O calls can be visualized in terms of the original datatype. The presented feature assists developers in understanding the datatype layout and spatial I/O access patterns of their application.},
}

@inproceedings{UFDAAEESAM09,
	author	 = {Olga Mordvinova and Julian Kunkel and Christian Baun and Thomas Ludwig and Marcel Kunze},
	title	 = {{USB Flash Drives as an Energy Efficient Storage Alternative}},
	year	 = {2009},
	month	 = {10},
	booktitle	 = {{Proceedings of the 10th IEEE/ACM International Conference on Grid Computing}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {175--182},
	conference	 = {GRID-09},
	organization	 = {IEEE/ACM},
	location	 = {Banff, Alberta, Canada},
	isbn	 = {978-1-4244-5148-7},
	doi	 = {https://doi.org/10.1109/GRID.2009.5353062},
}

@inproceedings{UNIOIHPCTR09,
	author	 = {David Buettner and Julian Kunkel and Thomas Ludwig},
	title	 = {{Using Non-blocking I/O Operations in High Performance Computing to Reduce Execution Times}},
	year	 = {2009},
	booktitle	 = {{Proceedings of the 16th European PVM/MPI Users' Group Meeting on Recent Advances in Parallel Virtual Machine and Message Passing Interface}},
	publisher	 = {Springer-Verlag},
	address	 = {Berlin, Heidelberg},
	pages	 = {134--142},
	conference	 = {EuroPVM/MPI-09},
	organization	 = {CSC - IT},
	location	 = {Espoo, Finland},
	isbn	 = {978-3-642-03769-6},
	doi	 = {https://doi.org/10.1007/978-3-642-03770-2_20},
	abstract	 = {As supercomputers become faster, the I/O part of applications can become a real problem with regard to overall execution times. System administrators and developers of hardware or software components reduce execution times by creating new and optimized parts for the supercomputers. While this helps a lot in the struggle to minimize I/O times, adjustment of the execution environment is not the only option to improve overall application behavior. In this paper we examine whether the application programmer can also contribute by making use of non-blocking I/O operations. After an analysis of non-blocking I/O operations and their potential for shortening execution times, we present a benchmark which was created and run in order to see if the theoretical promises also hold in practice.},
	url	 = {http://www.springerlink.com/content/h7546112q656218l/fulltext.pdf},
}

@inproceedings{SAIPFSCLRV09,
	author	 = {Philip Carns and Sam Lang and Robert Ross and Murali Vilayannur and Julian Kunkel and Thomas Ludwig},
	title	 = {{Small-file Access in Parallel File Systems}},
	year	 = {2009},
	booktitle	 = {{IPDPS '09: Proceedings of the 2009 IEEE International Symposium on Parallel and Distributed Processing}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {1--11},
	conference	 = {IPDPS-09},
	organization	 = {University of Rome},
	location	 = {Rome, Italy},
	isbn	 = {978-1-4244-3751-1},
	doi	 = {https://doi.org/10.1109/IPDPS.2009.5161029},
	abstract	 = {Today's computational science demands have resulted in ever larger parallel computers, and storage systems have grown to match these demands. Parallel file systems used in this environment are increasingly specialized to extract the highest possible performance for large I/O operations, at the expense of other potential workloads. While some applications have adapted to I/O best practices and can obtain good performance on these systems, the natural I/O patterns of many applications result in generation of many small files. These applications are not well served by current parallel file systems at very large scale. This paper describes five techniques for optimizing small-file access in parallel file systems for very large scale systems. These five techniques are all implemented in a single parallel file system (PVFS) and then systematically assessed on two test platforms. A microbenchmark and the mdtest benchmark are used to evaluate the optimizations at an unprecedented scale. We observe as much as a 905\% improvement in small-file create rates, 1,106\% improvement in small-file stat rates, and 727\% improvement in small-file removal rates, compared to a baseline PVFS configuration on a leadership computing platform using 16,384 cores.},
	url	 = {http://www.mcs.anl.gov/uploads/cels/papers/P1571.pdf},
}

@inproceedings{TICIMAMKTM09,
	author	 = {Julian Kunkel and Yuichi Tsujita and Olga Mordvinova and Thomas Ludwig},
	title	 = {{Tracing Internal Communication in MPI and MPI-I/O}},
	year	 = {2009},
	month	 = {12},
	booktitle	 = {{International Conference on Parallel and Distributed Computing, Applications and Technologies, PDCAT}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {280--286},
	conference	 = {PDCAT-09},
	organization	 = {Hiroshima University},
	location	 = {Higashi Hiroshima, Japan},
	isbn	 = {978-0-7695-3914-0},
	doi	 = {https://doi.org/10.1109/PDCAT.2009.9},
	abstract	 = {MPI implementations can realize MPI operations with any algorithm that fulfills the specified semantics. To provide optimal efficiency, the MPI implementation might choose the algorithm dynamically, depending on the parameters given to the function call. However, this selection is not transparent to the user. While this abstraction is appropriate for common users, achieving best performance with fixed parameter sets requires knowledge of internal processing. Also, for developers of collective operations it might be useful to understand timing issues inside the communication or I/O call. In this paper we extended the PIOviz environment to trace MPI-internal communication. This allows the user to see PVFS server behavior together with the behavior in the MPI application and inside MPI itself. We present some analysis results for these capabilities for MPICH2 on a Beowulf cluster.},
}

@inproceedings{BDIPFSWTPM08,
	author	 = {Julian Kunkel and Thomas Ludwig},
	title	 = {{Bottleneck Detection in Parallel File Systems with Trace-Based Performance Monitoring}},
	year	 = {2008},
	booktitle	 = {{Euro-Par '08: Proceedings of the 14th international Euro-Par conference on Parallel Processing}},
	publisher	 = {Springer-Verlag},
	address	 = {Berlin, Heidelberg},
	pages	 = {212--221},
	conference	 = {Euro-Par-08},
	organization	 = {University of Las Palmas de Gran Canaria},
	location	 = {Las Palmas de Gran Canaria, Spain},
	isbn	 = {978-3-540-85450-0},
	doi	 = {https://doi.org/10.1007/978-3-540-85451-7_23},
	abstract	 = {Today we recognize a high demand for powerful storage. In industry this issue is tackled either with large storage area networks, or by deploying parallel file systems on top of RAID systems or on smaller storage networks. The bigger the system gets, the more important the ability becomes to analyze the performance and to identify bottlenecks in the architecture and the applications. We extended the performance monitor available in the parallel file system PVFS2 by including statistics of the server process and information about the system. Performance monitor data is available during runtime, and the server process was modified to store this data in off-line traces suitable for post-mortem analysis. These values can be used to detect bottlenecks in the system. Some measured results demonstrate how these help to identify bottlenecks and may assist in ranking the servers depending on their capabilities.},
	url	 = {http://www.springerlink.com/content/5rl1j3j05164608g/fulltext.pdf},
}

@inproceedings{DMOFSFIPKK08,
	author	 = {Michael Kuhn and Julian Kunkel and Thomas Ludwig},
	title	 = {{Directory-Based Metadata Optimizations for Small Files in PVFS}},
	year	 = {2008},
	booktitle	 = {{Euro-Par '08: Proceedings of the 14th international Euro-Par conference on Parallel Processing}},
	publisher	 = {Springer-Verlag},
	address	 = {Berlin, Heidelberg},
	pages	 = {90--99},
	conference	 = {Euro-Par-08},
	organization	 = {University of Las Palmas de Gran Canaria},
	location	 = {Las Palmas de Gran Canaria, Spain},
	isbn	 = {978-3-540-85450-0},
	doi	 = {https://doi.org/10.1007/978-3-540-85451-7_11},
	abstract	 = {Modern file systems maintain extensive metadata about stored files. While this usually is useful, there are situations when the additional overhead of such a design becomes a problem in terms of performance. This is especially true for parallel and cluster file systems, because due to their design every metadata operation is even more expensive. In this paper several changes made to the parallel cluster file system PVFS are presented. The changes are targeted at the optimization of workloads with large numbers of small files. To improve metadata performance, PVFS was modified such that unnecessary metadata is not managed anymore. Several tests with a large quantity of files were done to measure the benefits of these changes. The tests have shown that common file system operations can be sped up by a factor of two even with relatively few changes.},
}

@inproceedings{PEOTPAKL07,
	author	 = {Julian Kunkel and Thomas Ludwig},
	title	 = {{Performance Evaluation of the PVFS2 Architecture}},
	year	 = {2007},
	booktitle	 = {{PDP '07: Proceedings of the 15th Euromicro International Conference on Parallel, Distributed and Network-Based Processing}},
	publisher	 = {IEEE Computer Society},
	address	 = {Washington, DC, USA},
	pages	 = {509--516},
	conference	 = {PDP-07},
	organization	 = {Euromicro},
	location	 = {Napoli, Italy},
	isbn	 = {0-7695-2784-1},
	doi	 = {https://doi.org/10.1109/PDP.2007.65},
	abstract	 = {As the complexity of parallel file systems' software stacks increases, it gets harder to reveal the reasons for performance bottlenecks in these software layers. This paper introduces a method which eliminates the influence of the physical storage on performance analysis in order to find these bottlenecks. Also, the influence of the hardware components on the performance is modeled to estimate the maximum achievable performance of a parallel file system. The paper focuses on the Parallel Virtual File System 2 (PVFS2) and shows results for file creation, small contiguous I/O requests and large contiguous I/O requests.},
}

@inproceedings{AOTMOLWTPJ07,
	author	 = {Thomas Ludwig and Stephan Krempel and Michael Kuhn and Julian Kunkel and Christian Lohse},
	title	 = {{Analysis of the MPI-IO Optimization Levels with the PIOViz Jumpshot Enhancement}},
	year	 = {2007},
	booktitle	 = {{Recent Advances in Parallel Virtual Machine and Message Passing Interface}},
	editor	 = {Franck Cappello and Thomas Hérault and Jack Dongarra},
	publisher	 = {Springer},
	address	 = {Berlin / Heidelberg, Germany},
	series	 = {Lecture Notes in Computer Science},
	number	 = {4757},
	pages	 = {213--222},
	conference	 = {EuroPVM/MPI-07},
	organization	 = {Institut national de recherche en informatique et automatique},
	location	 = {Paris, France},
	isbn	 = {978-3-540-75415-2},
	doi	 = {https://doi.org/10.1007/978-3-540-75416-9_32},
	abstract	 = {With MPI-IO we see various alternatives for programming file I/O. The overall program performance depends on many different factors. A new trace analysis environment provides deeper insight into the client/server behavior and visualizes events of both process types. We investigate the influence of making independent vs. collective calls together with access to contiguous and non-contiguous data regions in our MPI-IO program. Combined client and server traces exhibit reasons for observed I/O performance.},
	url	 = {http://www.springerlink.com/content/p475547454373863/fulltext.pdf},
}

@inproceedings{TTMCDALKKP06,
	author	 = {Thomas Ludwig and Stephan Krempel and Julian Kunkel and Frank Panse and Dulip Withanage},
	title	 = {{Tracing the MPI-IO Calls' Disk Accesses}},
	year	 = {2006},
	booktitle	 = {{Recent Advances in Parallel Virtual Machine and Message Passing Interface}},
	editor	 = {Bernd Mohr and Jesper Larsson Träff and Joachim Worringen and Jack Dongarra},
	publisher	 = {Springer},
	address	 = {Berlin / Heidelberg, Germany},
	series	 = {Lecture Notes in Computer Science},
	number	 = {4192},
	pages	 = {322--330},
	conference	 = {EuroPVM/MPI-06},
	organization	 = {C\&C Research Labs, NEC Europe Ltd., and the Research Centre Jülich},
	location	 = {Bonn, Germany},
	isbn	 = {3-540-39110-X},
	doi	 = {https://doi.org/10.1007/11846802_45},
	abstract	 = {With parallel file I/O we are faced with the situation that we do not have appropriate tools to get an insight into the I/O server behavior depending on the I/O calls in the corresponding parallel MPI program. We present an approach that allows us to also get event traces from the I/O server environment and to merge them with the client trace. Corresponding events will be matched and visualized. We integrate this functionality into the parallel file system PVFS2 and the MPICH2 tool Jumpshot.},
	url	 = {http://www.springerlink.com/content/537j28201153t3n7/fulltext.pdf},
}

@inproceedings{TAAFUTGFLH16,
	author	 = {Christian Hovy and Julian Kunkel},
	title	 = {{Towards Automatic and Flexible Unit Test Generation for Legacy HPC Code}},
	year	 = {2016},
	booktitle	 = {{Proceedings of the Fourth International Workshop on Software Engineering for High Performance Computing in Computational Science and Engineering}},
	conference	 = {SEHPCCSE16},
	location	 = {Salt Lake City, Utah, USA},
	doi	 = {https://doi.org/10.1109/SE-HPCCSE.2016.005},
	abstract	 = {Unit testing is an established practice in professional software development. However, in high-performance computing (HPC) with its scientific applications, it is not widely applied. Besides general problems regarding testing of scientific software, for many HPC applications the effort of creating small test cases with a consistent set of test data is high.
    We have created a tool called FortranTestGenerator that significantly reduces the effort of creating unit tests for subroutines of an existing Fortran application. It is based on Capture \& Replay (C\&R), that is, it extracts data while running the original application and uses the extracted data as test input data. The tool automatically generates code for capturing the input data and a basic test driver which can be extended by the developer to an appropriate unit test. A static source code analysis is conducted to reduce the number of captured variables. Code is generated based on flexibly customizable templates. Thus, both the capturing process and the unit tests can easily be integrated into an existing software ecosystem.
    Since most HPC applications use message passing for parallel processing, we also present an approach to extend our C\&R model to MPI communication. This allows extraction of unit tests from massively parallel applications that can be run with a single process.},
}

@misc{TVIFIATIKL19,
	author	 = {Julian Kunkel and Jay Lofstead and John Bent and George Markomanolis},
	title	 = {{The Virtual Institute for I/O and the IO-500}},
	year	 = {2019},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance 2019},
	abstract	 = {The research community in high-performance computing is organized loosely. There are many distinct resources such as homepages of research groups and benchmarks. The Virtual Institute for I/O aims to provide a hub for the community and particularly newcomers to find relevant information in many directions. It hosts the comprehensive data center list (CDCL). Similarly to the Top500, it contains information about supercomputers and their storage systems. I/O benchmarking, particularly the intercomparison of measured performance between sites, is tricky as there are more hardware components involved and configurations to take into account. Therefore, together with the community, we standardized an HPC I/O benchmark, the IO-500 benchmark, for which the first list had been released during Supercomputing in Nov. 2017. Such a benchmark is also useful to assess the impact of system issues like the Meltdown and Spectre bugs. This poster introduces the Virtual Institute for I/O, the high-performance storage list and the effort for the IO-500, which are unfunded community projects.},
	url	 = {https://2019.isc-program.com/presentation/?id=proj105&sess=sess286},
}

@misc{PCHHHSBKKO19,
	author	 = {Kai Himstedt and Nathanael Hübbe and Sandra Schröder and Hendryk Bockelmann and Michael Kuhn and Julian Kunkel and Stephan Olbrich and Thomas Ludwig and Matthias Riebisch and Markus Stammberger and Hinnerk Stüben},
	title	 = {{Performance Conscious HPC (PeCoH) - 2019}},
	year	 = {2019},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance},
	abstract	 = {In PeCoH, we establish the Hamburg HPC Competence Center (HHCC) as a virtual institution, which coordinates and fosters joint performance engineering activities between the local compute centers DKRZ, RRZ and TUHH RZ. Together, we will implement user services to support performance engineering on a basic level and provide a basis for co-development, user education and dissemination of performance engineering concepts. In this poster we focus on performance awareness, software engineering for HPC, and the development of our HPC certification program. Project outputs and ongoing activities are presented.},
	url	 = {https://2019.isc-program.com/presentation/?id=proj112&sess=sess286},
}

@misc{ACAIMFESJK19,
	author	 = {Nabeeh Jumah and Julian Kunkel and Anastasiia Novikova and Thomas Ludwig and Thomas Dubos and Sunmin Park and Hisashi Yashiro and Günther Zängl and John Thuburn},
	title	 = {{Advanced Computation and I/O Methods for Earth-System Simulations (AIMES)}},
	year	 = {2019},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance},
	abstract	 = {The Advanced Computation and I/O Methods for Earth-System Simulations (AIMES) project addresses the key issues of programmability, computational efficiency and I/O limitations that are common in next-generation icosahedral earth-system models. Ultimately, the project is intended to foster development of best-practices and useful norms by cooperating on shared ideas and components. During the project, we will ensure that the developed concepts and tools are not only applicable for earth-science but for other scientific domains as well. In this poster we show the project's plan and progress and present some results.},
	url	 = {https://2019.isc-program.com/presentation/?id=proj104&sess=sess286},
}

@misc{IHCPKHFALG19,
	author	 = {Julian Kunkel and Kai Himstedt and Weronika Filinger and Jean-Thomas Acquaviva and Lev Lafayette and Anja Gerbes and Waseem Kamleh and Sharan Kalwan},
	title	 = {{International HPC Certification Program}},
	year	 = {2019},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance},
	abstract	 = {The HPC community has always considered the training of new and existing HPC practitioners to be of high importance to its growth. The significance of training will increase even further in the era of Exascale when HPC encompasses even more scientific disciplines. This diversification of HPC practitioners challenges the traditional training approaches, which are not able to satisfy the specific needs of users, often coming from non-traditional HPC disciplines and only interested in learning a particular set of skills. HPC centres are struggling to identify and overcome the gaps in users’ knowledge. How should we support prospective and existing users who are not aware of their own knowledge gaps? We are working towards the establishment of an International HPC Certification program that would clearly categorize, define and examine these skills, similarly to a school curriculum. Ultimately, we aim for the certificates to be recognized and respected by the HPC community and industry.},
	url	 = {https://2019.isc-program.com/presentation/?id=proj114&sess=sess286},
}

@misc{ACAIMFESKL18,
	author	 = {Julian Kunkel and Thomas Ludwig and Thomas Dubos and Naoya Maruyama and Takayuki Aoki and Günther Zängl and Hisashi Yashiro and Ryuji Yoshida and Hirofumi Tomita and Masaki Satoh and Yann Meurdesoif and Nabeeh Jumah and Anastasiia Novikova and Anja Gerbes},
	title	 = {{Advanced Computation and I/O Methods for Earth-System Simulations (AIMES)}},
	year	 = {2018},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC HPC},
	abstract	 = {The Advanced Computation and I/O Methods for Earth-System Simulations (AIMES) project addresses the key issues of programmability, computational efficiency and I/O limitations that are common in next-generation icosahedral earth-system models. Ultimately, the project is intended to foster development of best-practices and useful norms by cooperating on shared ideas and components. During the project, we will ensure that the developed concepts and tools are not only applicable for earth-science but for other scientific domains as well. In this poster we show the project's plan and progress during the first two years of the project lifecycle.},
	url	 = {https://2018.isc-program.com/?page_id=10&id=proj103&sess=sess144},
}

@misc{PCHHHSBKKL18,
	author	 = {Kai Himstedt and Nathanael Hübbe and Sandra Schröder and Hendryk Bockelmann and Michael Kuhn and Julian Kunkel and Thomas Ludwig and Stephan Olbrich and Matthias Riebisch and Markus Stammberger and Hinnerk Stüben},
	title	 = {{Performance Conscious HPC (PeCoH) - 2018}},
	year	 = {2018},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC HPC},
	abstract	 = {In PeCoH, we establish the Hamburg HPC Competence Center (HHCC) as a virtual institution, which coordinates and fosters joint performance engineering activities between the local compute centers DKRZ, RRZ and TUHH RZ. Together, we will implement user services to support performance engineering on a basic level and provide a basis for co-development, user education and dissemination of performance engineering concepts. We will evaluate methods to raise user awareness for performance engineering and bring them into production environments in order to tune standard software as well as individual software. Specifically, we address cost-awareness, provide representative success stories, and provide basic and advanced HPC knowledge as online content resulting in a certification system.},
	url	 = {https://2018.isc-program.com/?page_id=10&id=proj114&sess=sess144},
}

@misc{IHCPKHFAJL18,
	author	 = {Julian Kunkel and Kai Himstedt and Weronika Filinger and Jean-Thomas Acquaviva and William Jalby and Lev Lafayette},
	title	 = {{International HPC Certification Program}},
	year	 = {2018},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC HPC},
	abstract	 = {The HPC community has always considered the training of new and existing HPC practitioners to be of high importance to its growth. The significance of training will increase even further in the era of Exascale when HPC encompasses even more scientific disciplines. This diversification of HPC practitioners challenges the traditional training approaches, which are not able to satisfy the specific needs of users, often coming from non-traditional HPC disciplines and only interested in learning a particular set of skills. HPC centres are struggling to identify and overcome the gaps in users’ knowledge. How should we support prospective and existing users who are not aware of their own knowledge gaps? We are working towards the establishment of an International HPC Certification program that would clearly categorize, define and examine these skills, similarly to a school curriculum. Ultimately, we aim for the certificates to be recognized and respected by the HPC community and industry.},
	url	 = {https://2018.isc-program.com/?page_id=10&id=proj129&sess=sess144},
}

@misc{TVIFIATIKL18,
	author	 = {Julian Kunkel and Jay Lofstead and John Bent},
	title	 = {{The Virtual Institute for I/O and the IO-500}},
	year	 = {2018},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC HPC},
	abstract	 = {The research community in high-performance computing is organized loosely. There are many distinct resources such as homepages of research groups and benchmarks. The Virtual Institute for I/O aims to provide a hub for the community and particularly newcomers to find relevant information in many directions. It hosts the comprehensive data center list (CDCL). Similarly to the Top500, it contains information about supercomputers and their storage systems. I/O benchmarking, particularly the intercomparison of measured performance between sites, is tricky as there are more hardware components involved and configurations to take into account. Therefore, together with the community, we standardized an HPC I/O benchmark, the IO-500 benchmark, for which the first list had been released during Supercomputing in Nov. 2017. This poster introduces the Virtual Institute for I/O, the high-performance storage list and the effort for the IO-500, which are unfunded community projects.},
	url	 = {https://2018.isc-program.com/?page_id=10&id=proj101&sess=sess144},
}

@misc{AUGCTTFPPO18,
	author	 = {Nabeeh Jumah and Julian Kunkel},
	title	 = {{A user-controlled GGDML Code Translation Technique for Performance Portability of Earth System Models}},
	year	 = {2018},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC HPC},
	abstract	 = {Demand for high-performance computing is increasing in earth system modeling, and in natural sciences in general. Unfortunately, automatic optimizations done by compilers are not enough to make use of target machines' capabilities. Manual code adjustments are mandatory to exploit hardware capabilities. However, optimizing for one architecture may degrade performance for other architectures. This loss of portability is a challenge. Our approach involves the use of the GGDML language extensions to write higher-level modeling code and the use of a user-controlled source-to-source translation technique. Translating the code results in an optimized version for the target machine. The contributions of this poster are: 1) the use of a highly-configurable code translation technique to transform higher-level code into target-machine-optimized code; 2) evaluation of code transformation for multi-core and GPU-based machines, in both single- and multi-node configurations.},
	url	 = {https://2018.isc-program.com/?page_id=10&id=post104&sess=sess113},
}

@misc{APFCMGJK18,
	author	 = {Anja Gerbes and Nabeeh Jumah and Julian Kunkel},
	title	 = {{Automatic Profiling for Climate Modeling}},
	year	 = {2018},
	month	 = {04},
	location	 = {Bristol, United Kingdom},
	activity	 = {Euro LLVM},
	abstract	 = {Some applications, such as climate models, are time-consuming because they include lengthy simulations. Hence, their code is performance-sensitive. Spending more time on the optimization of specific code parts can improve total performance. Profiling an application is a well-known technique to do that. Many tools are available for developers to get performance information about their code. Our Python package, the Performance Analysis and Source-Code Instrumentation Toolsuite (PASCIT), enables automatic instrumentation of a user’s source code. Developers mark the parts that they need performance information about. We present an effort to profile climate modeling codes with two alternative methods: • usage of the GGDML translation tool to directly mark the computational kernels of an application for profiling. • usage of the GGDML translation tool to generate a serial code in a first step and then use LLVM/Clang to instrument some code parts with a profiler’s directives. The resulting codes are profiled with the LIKWID profiler. Alternatively, we use perf and OProfile’s ocount \& operf to measure hardware characteristics. The resulting performance report, which visualizes the measured hardware performance counters as radar charts, LaTeX tables and box plots, helps scientists to understand the bottlenecks of their codes.},
	url	 = {http://llvm.org/devmtg/2017-03//2017/02/20/accepted-sessions.html#42},
}

@misc{ATSFNAHLBP17,
	author	 = {Jakob Lüttgau and Eugen Betke and Olga Perevalova and Julian Kunkel and Michael Kuhn},
	title	 = {{Adaptive Tier Selection for NetCDF and HDF5}},
	year	 = {2017},
	month	 = {11},
	location	 = {Denver, CO, USA},
	activity	 = {SC17},
}

@misc{TDTSOCAFQC17,
	author	 = {Julian Kunkel and Anastasia Novikova and Eugen Betke},
	title	 = {{Toward Decoupling the Selection of Compression Algorithms from Quality Constraints}},
	year	 = {2017},
	month	 = {11},
	location	 = {Denver, CO, USA},
	activity	 = {SC17},
}

@misc{MASOTLFHSS16,
	author	 = {Jakob Lüttgau and Julian Kunkel},
	title	 = {{Modeling and Simulation of Tape Libraries for Hierarchical Storage Systems}},
	year	 = {2016},
	month	 = {11},
	location	 = {Salt Lake City, Utah, USA},
	activity	 = {SC16},
	url	 = {http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post123.html},
}

@misc{UISFMKB16,
	author	 = {Julian Kunkel and Eugen Betke},
	title	 = {{Utilizing In-Memory Storage for MPI-IO}},
	year	 = {2016},
	month	 = {11},
	location	 = {Salt Lake City, Utah, USA},
	activity	 = {SC16},
	url	 = {http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post128.html},
}

@misc{TVIFIATIKL17,
	author	 = {Julian Kunkel and Jay Lofstead and John Bent},
	title	 = {{The Virtual Institute for I/O and the IO-500}},
	year	 = {2017},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance 2017},
}

@misc{IMWGJKZYDM17,
	author	 = {Nabeeh Jumah and Julian Kunkel and Günther Zängl and Hisashi Yashiro and Thomas Dubos and Yann Meurdesoif},
	title	 = {{Icosahedral Modeling with GGDML}},
	year	 = {2017},
	month	 = {10},
	location	 = {Hamburg, Germany},
	activity	 = {DKRZ user workshop 2017},
	abstract	 = {The atmospheric and climate sciences, and the natural sciences in general, are increasingly demanding higher-performance computing. Unfortunately, the gap between the diversity of the hardware architectures that the manufacturers provide to fulfill the needs for performance and the scientific modeling can't be filled by general-purpose languages and compilers. Scientists need to manually optimize their models to exploit the machine capabilities. This leads to code redundancies when targeting different machines. This is not trivial when considering heterogeneous computing as a basis for exascale computing. In order to provide performance portability for icosahedral climate modeling, we have developed a set of higher-level language extensions we call GGDML. The extensions provide semantically higher-level constructs that allow expressing scientific problems with scientific concepts. This eliminates the need to explicitly provide lower-level machine-dependent code. Scientists still use the general-purpose language. The GGDML code is translated by a source-to-source translation tool that optimizes the generated code for a specific machine. The translation process is driven by configurations that are provided independently from the source code. In this poster we review some GGDML extensions, and we focus mainly on the configurable code translation of the higher-level code.},
}

@misc{ACAIMFESKL17,
	author	 = {Julian Kunkel and Thomas Ludwig and Thomas Dubos and Naoya Maruyama and Takayuki Aoki and Günther Zängl and Hisashi Yashiro and Ryuji Yoshida and Hirofumi Tomita and Masaki Satoh and Yann Meurdesoif and Nabeeh Jumah and Anastasiia Novikova},
	title	 = {{Advanced Computation and I/O Methods for Earth-System Simulations (AIMES)}},
	year	 = {2017},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC 2017},
	abstract	 = {The Advanced Computation and I/O Methods for Earth-System Simulations (AIMES) project addresses the key issues of programmability, computational efficiency and I/O limitations that are common in next-generation icosahedral earth-system models. Ultimately, the project is intended to foster development of best-practices and useful norms by cooperating on shared ideas and components. During the project, we ensure that the developed concepts and tools are not only applicable for earth-science but for other scientific domains as well.},
	url	 = {http://www.isc-hpc.com/isc17_ap/auftritt/daten/attachments/PP15_Jumah.pdf},
}

@misc{TPPFAACMWT17,
	author	 = {Nabeeh Jumah and Julian Kunkel and Günther Zängl and Hisashi Yashiro and Thomas Dubos and Yann Meurdesoif},
	title	 = {{Towards Performance Portability for Atmospheric and Climate Models with the GGDML DSL}},
	year	 = {2017},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC 2017},
	abstract	 = {Demand for high-performance computing is increasing in atmospheric and climate sciences, and in natural sciences in general. Unfortunately, automatic optimizations done by compilers are not enough to make use of target machines' capabilities. Manual code adjustments are mandatory to exploit hardware capabilities. However, optimizing for one architecture may degrade performance for other architectures. This loss of portability is a challenge. With GGDML we examine an approach for icosahedral-grid based climate and atmospheric models that is based on a domain-specific language (DSL) which fosters separation of concerns between domain scientists and computer scientists. Our DSL extends the Fortran language with concepts from domain science, apart from any technical descriptions such as hardware-based optimization. The approach aims to achieve high performance, portability and maintainability through a compilation infrastructure principally built upon configurations from computer scientists. Fortran code extended with novel semantics from the DSL goes through the meta-DSL based compilation procedure. This generates high-performance code, aware of platform features, based on the provided configurations. We show that our approach reduces code significantly (to 40\%) and improves readability for the models DYNAMICO, ICON and NICAM. We also show that the whole approach is viable in terms of performance portability, as it allows generating platform-optimized code with minimal configuration changes. With a few lines, we are able to switch between two different memory representations during compilation and achieve double the performance. In addition, applying inlining and loop fusion yields a 10 percent performance enhancement.},
	url	 = {http://isc-hpc.com/isc17_ap/auftritt/daten/attachments/RP19_Jumah.pdf},
}

@misc{PCHKKLROSH17,
	author	 = {Julian Kunkel and Michael Kuhn and Thomas Ludwig and Matthias Riebisch and Stephan Olbrich and Hinnerk Stüben and Kai Himstedt and Hendryk Bockelmann and Markus Stammberger},
	title	 = {{Performance Conscious HPC (PeCoH)}},
	year	 = {2017},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance 2017},
	url	 = {http://isc-hpc.com/isc17_ap/presentationdetails.htm?t=presentation&o=1196&a=select&ra=personendetails},
}

@misc{EACILFKKL17,
	author	 = {Anna Fuchs and Michael Kuhn and Julian Kunkel and Thomas Ludwig},
	title	 = {{Enhanced Adaptive Compression in Lustre}},
	year	 = {2017},
	month	 = {06},
	location	 = {Frankfurt, Germany},
	activity	 = {ISC High Performance 2017},
	url	 = {http://isc-hpc.com/isc17_ap/presentationdetails.htm?t=presentation&o=1144&a=select&ra=personendetails},
}

@misc{ISOCOTOCTA17,
	author	 = {Anja Gerbes and Julian Kunkel and Nabeeh Jumah},
	title	 = {{Intelligent Selection of Compiler Options to Optimize Compile Time and Performance}},
	year	 = {2017},
	month	 = {03},
	location	 = {Saarbrücken},
	activity	 = {Euro LLVM},
	abstract	 = {The efficiency of the optimization process during the compilation is crucial for the later execution behavior of the code. The achieved performance depends on the hardware architecture and the compiler's capabilities to extract this performance. Code optimization can be a CPU- and memory-intensive process which, for large codes, can lead to high compilation times during development. Optimization also influences the debuggability of the resulting binary; for example, by storing data in registers. During development, it would be interesting to compile files individually with appropriate flags that enable debugging and provide high (near-production) performance during the testing, but with moderate compile times. We are exploring the creation of a tool to identify code regions that are candidates for higher optimization levels. We follow two different approaches to identify the most efficient code optimization: 1) compiling different files with different options by brute force; 2) using profilers to identify the relevant code regions that should be optimized. Since big projects comprise hundreds of files, brute force is not efficient. The problem in, e.g., climate applications is that codes have too many files to test them individually. Improving this strategy using a profiler, we can identify the time-consuming regions (and files) and then repeatedly refine our selection. Then, the relevant files are evaluated with different compiler flags to determine a good compromise of the flags. Once the appropriate flags are determined, this information could be retained across builds and shared between users. In our poster, we motivate and demonstrate this strategy on a stencil code derived from climate applications. The experiments done throughout this work are carried out on a recent Intel Skylake (i7-6700 CPU @ 3.40GHz) machine. We compare the performance of the compilers clang (version 3.9.1) and gcc (version 6.3.0) for various optimization flags, using profile-guided optimization (PGO) both with the traditional compile-with-instrumentation/run/recompile phases and when using the perf tool for dynamic instrumentation. The results show that, in general, more time (2x) is spent compiling code at higher optimization levels, though gcc takes a little less time than clang. Yet the performance of the application after compiling the whole code with O3 was comparable to that of applying O3 optimization to the right subset of files. Thus, the approach proves to be effective for repositories where compilation is analyzed to guide subsequent compilations. Based on these results, we are building a prototype tool that can be embedded into build systems and that realizes the aforementioned strategies of brute-force testing and profile-guided analysis of relevant compilation flags.},
	url	 = {http://llvm.org/devmtg/2017-03//2017/02/20/accepted-sessions.html#42},
}

@misc{FAAFUTGFLH17,
	author	 = {Christian Hovy and Julian Kunkel},
	title	 = {{FortranTestGenerator: Automatic and Flexible Unit Test Generation for Legacy HPC Code}},
	year	 = {2017},
	month	 = {06},
	location	 = {Frankfurt},
	activity	 = {ISC High Performance 2017},
	abstract	 = {Unit testing is an established practice in professional software development.
      However, in high-performance computing (HPC) with its scientific applications, it is not widely applied.
      Besides general problems regarding testing of scientific software, for many HPC applications the effort of creating small test cases with a consistent set of test data is high.
      We have created a tool called FortranTestGenerator to reduce the effort of creating unit tests for subroutines of an existing Fortran application.
      It is based on Capture \& Replay (C\&R), that is, it extracts data while running the original application and uses the extracted data as test input data.
      The tool automatically generates code for capturing the input data and a basic test driver which can be extended by the developer to a meaningful unit test.
      A static source code analysis is conducted to reduce the number of captured variables.
      Code is generated based on flexibly customizable templates.
      Thus, both the capturing process and the unit tests can easily be integrated into an existing software ecosystem.},
}

@misc{ADPUSSTIOS16,
	author	 = {Julian Kunkel},
	title	 = {{Analyzing Data Properties using Statistical Sampling Techniques – Illustrated on Scientific File Formats and Compression Features}},
	year	 = {2016},
	month	 = {06},
	location	 = {Frankfurt},
	activity	 = {ISC High Performance 2016},
	abstract	 = {Understanding the characteristics of data stored in data centers helps computer scientists identify the most suitable storage infrastructure to deal with these workloads. For example, knowing the relevance of file formats not only allows optimizing the relevant file formats but also helps in a procurement to define useful benchmarks. Existing studies that investigate performance improvements and techniques for data reduction such as deduplication and compression operate on a small set of data. Some of those studies claim the selected data is representative and scale their results to the scale of the data center. One hurdle of evaluating novel schemes on the complete data is the vast amount of data stored and, thus, the resources required to analyze the complete data set. Even if this were feasible, the costs for running many of those experiments must be justified. This poster investigates stochastic sampling methods to compute and analyze quantities of interest on file numbers as well as on the occupied storage space. It is demonstrated that scanning 1\% of files and data volume is sufficient on DKRZ's supercomputer to obtain accurate results. This not only speeds up the analysis process but reduces the costs of such studies significantly. Contributions of this poster are: 1) investigation of the inherent error when operating only on a subset of data, 2) presentation of methods that help future studies to mitigate this error, and 3) illustration of the approach with a study of scientific file types and compression.},
}

@misc{PIIHUANNSK16,
	author	 = {Jan Fabian Schmid and Julian Kunkel},
	title	 = {{Predicting I/O-performance in HPC using Artificial Neural Networks}},
	year	 = {2016},
	month	 = {06},
	location	 = {Frankfurt},
	activity	 = {ISC High Performance 2016},
	abstract	 = {Tools are demanded that help users of HPC facilities to implement efficient input/output (I/O) in their programs. It is difficult to find the best access parameters and patterns due to complex parallel storage systems. To develop tools which support the implementation of efficient I/O, a computational model of the storage system is key. For single hard disk systems, such a model can be derived analytically [1]; however, for the complex storage system of a supercomputer these models become too difficult to configure [2]. Therefore, we searched for good predictors of I/O performance using a machine learning approach with artificial neural networks (ANNs). A hypothesis was then proposed: the I/O path significantly influences the time needed to access a file. In our analysis we used ANNs with different input information for the prediction of access times. To use I/O paths as input for the ANNs, we developed a method which approximates the different I/O paths the storage system used during a benchmark test. This method utilizes error classes.},
}

@misc{ICKKLLTKS15,
	author	 = {Julian Kunkel and Thomas Ludwig and Jakob Lüttgau and Dion Timmermann and Christian Kautz and Volker Skwarek},
	title	 = {{Interaktiver C Kurs (ICP)}},
	year	 = {2015},
	month	 = {11},
	location	 = {Hamburg},
	activity	 = {Campus Innovation 2015},
	abstract	 = {Programming languages form the basis for automated data processing in the digital world. Although the basic concepts are easy to understand, only a small proportion of people master these tools. The reasons for this are deficits in education and the entry barrier posed by setting up a productive programming environment. In particular, learning a programming language requires the practical application of the language. Integrating programming courses into the Hamburg Open Online University not only improves the offering for students, but also opens up access to computer science for people from other disciplines.},
	url	 = {http://wr.informatik.uni-hamburg.de/_media/research/projects/icp/hoou-2016-poster.pdf},
}

@misc{UMLTPTPONI15,
	author	 = {Julian Kunkel and Michaela Zimmer and Eugen Betke},
	title	 = {{Using Machine Learning to Predict the Performance of Non-Contiguous I/O}},
	year	 = {2015},
	month	 = {07},
	location	 = {Frankfurt, Germany},
	url	 = {http://www.isc-hpc.com/research-posters.html},
}

@misc{ADSFNIZK15,
	author	 = {Enno Zickler and Julian Kunkel},
	title	 = {{Advanced Data Sieving for Non-Contiguous I/O}},
	year	 = {2015},
	month	 = {07},
	location	 = {Frankfurt, Germany},
	url	 = {http://www.isc-hpc.com/research-posters.html},
}

@misc{SAIFMAOOHK14,
	author	 = {Julian Kunkel and Michaela Zimmer and Marc Wiedemann and Nathanael Hübbe and Alvaro Aguilera and Holger Mickler and Xuan Wang and Andrij Chut and Thomas Bönisch},
	title	 = {{SIOX: An Infrastructure for Monitoring and Optimization of HPC-I/O}},
	year	 = {2014},
	month	 = {06},
	location	 = {Leipzig, Germany},
	activity	 = {ISC'14},
	abstract	 = {Performance analysis and optimization of high-performance I/O systems is a daunting task. Mainly, this is due to the overwhelmingly complex interplay of the involved hardware and software layers. The Scalable I/O for Extreme Performance (SIOX) project provides a versatile environment for monitoring I/O activities and learning from this information. The goal of SIOX is to automatically suggest and apply performance optimizations, and to assist in locating and diagnosing performance problems.
In this poster, we present the current status of SIOX. Our modular architecture covers instrumentation of POSIX, MPI and other high-level I/O libraries; the monitoring data is recorded asynchronously into a global database, and recorded traces can be visualized. Furthermore, we offer a set of primitive plug-ins with additional features to demonstrate the flexibility of our architecture: a surveyor plug-in to keep track of the observed spatial access patterns; an fadvise plug-in for injecting hints to achieve read-ahead for strided access patterns; and an optimizer plug-in which monitors the performance achieved with different MPI-IO hints, automatically supplying the best known hint set when no hints were explicitly set. The presentation of the technical status is accompanied by a demonstration of some of these features on our 20-node cluster. In additional experiments, we analyze the overhead for concurrent access, for MPI-IO's four levels of access, and for an instrumented climate application.
While our prototype is not yet full-featured, it demonstrates the potential and feasibility of our approach.},
	url	 = {http://www.isc-events.com/isc14_ap/presentationdetails.htm},
}

@misc{STFCMTLK13,
	author	 = {Raul Torres and Leonidas Linardakis and Julian Kunkel},
	title	 = {{Source-to-Source Translation for Climate Models}},
	year	 = {2013},
	month	 = {06},
	location	 = {Leipzig, Germany},
	activity	 = {International Supercomputing Conference 2013},
	url	 = {http://www.isc-events.com/isc13_ap/presentationdetails.php?t=contribution&o=2117&a=select&ra=eventdetails},
}

@misc{BAIITCKMRK10,
	author	 = {Julian Kunkel and Olga Mordvinova and Dennis Runz and Michael Kuhn and Thomas Ludwig},
	title	 = {{Benchmarking Application I/O in the Community}},
	year	 = {2010},
	month	 = {06},
	location	 = {Hamburg, Germany},
	activity	 = {International Supercomputing Conference},
	url	 = {http://www.supercomp.de/isc10/itinerary/search&type=10},
}

@misc{SOCPCAEMKL10,
	author	 = {Timo Minartz and Julian Kunkel and Thomas Ludwig},
	title	 = {{Simulation of Cluster Power Consumption and Energy-to-Solution}},
	year	 = {2010},
	month	 = {04},
	location	 = {Passau, Germany},
	activity	 = {International Conference on Energy-Efficient Computing and Networking},
	url	 = {http://www.e-energy.uni-passau.de/nc/program/technical-program/poster-session.html?cid=128&did=132&sechash=f09d1ce7},
}

@misc{DSAPFHTRSK09,
	author	 = {Julian Kunkel and Thomas Ludwig and M. Hemberger and G. Torralba and E. Schmitt and M. Hausmann and V. Lindenstruth and N. Brown and R. Schneider},
	title	 = {{Data Storage and Processing for High Throughput RNAi Screening}},
	year	 = {2009},
	location	 = {Heidelberg, Germany},
	activity	 = {German Symposium on Systems Biology 2009},
}

@techreport{ABFTNSRAFH19,
	author	 = {Gabriel Antoniu and Marc Asch and Peter Bauer and Costas Bekas and Pascale Bernier-Bruna and Francois Bodin and Laurent Cargemel and Paul Carpenter and Marc Duranton and Maike Gilliot and Hans-Christian Hoppe and Jens Krueger and Julian Kunkel and Erwin Laure and Jean-Francois Lavignon and Guy Lonsdale and Michael Malms and Fabio Martinelli and Sai Narasimhamurthy and Marcin Ostasz and Maria Perez and Dirk Pleiter and Andrea Reale and Pascale Rosse-Laurent},
	title	 = {{A blueprint for the new Strategic Research Agenda for High Performance Computing}},
	year	 = {2019},
	month	 = {04},
	publisher	 = {ETP4HPC, EXDCI},
	url	 = {https://www.etp4hpc.eu/hpc-vision-018.html},
}

@techreport{WRSCABCFGJ17,
	author	 = {Yevhen Alforov and Eugen Betke and Konstantinos Chasapis and Anna Fuchs and Fabian Große and Nabeeh Jumah and Michael Kuhn and Julian Kunkel and Hermann Lenhart and Jakob Lüttgau and Philipp Neumann and Anastasiia Novikova and Jannek Squar and Thomas Ludwig},
	title	 = {{Wissenschaftliches Rechnen - Scientific Computing - 2016}},
	year	 = {2017},
	month	 = {06},
	publisher	 = {Research Group: Scientific Computing, University of Hamburg},
	address	 = {Deutsches Klimarechenzentrum GmbH, Bundesstraße 45a, D-20146 Hamburg},
}

@techreport{SATFLSAOCC17,
	author	 = {Julian Kunkel},
	title	 = {{SFS: A Tool for Large Scale Analysis of Compression Characteristics}},
	year	 = {2017},
	month	 = {05},
	publisher	 = {Research Group: Scientific Computing, University of Hamburg},
	address	 = {Deutsches Klimarechenzentrum GmbH, Bundesstraße 45a, D-20146 Hamburg},
	series	 = {Research Papers},
	number	 = {4},
	abstract	 = {Data centers manage petabytes of storage. Identifying a fast lossless compression algorithm that can be enabled on the storage system and that potentially reduces the data by an additional 10\% is significant. However, it is not trivial to evaluate algorithms on huge data pools, as such an evaluation requires running the algorithms and is therefore costly, too. Hence, there is a need for tools that optimize such an analysis. In this paper, the open source tool SFS is described, which performs these scans efficiently. While based on an existing open source tool, SFS builds on a proven method to scan huge quantities of data using statistical sampling. Additionally, we present results for 162 variants of various algorithms evaluated on three data pools with scientific data and one more general-purpose data pool. Based on this analysis, promising classes of algorithms are identified.},
}

@techreport{SGDUELDKKL15,
	author	 = {Thomas Ludwig and Manuel Dolz and Michael Kuhn and Julian Kunkel and Hermann Lenhart},
	title	 = {{Speicherung großer Datenmengen und Energieeffizienz}},
	year	 = {2015},
	publisher	 = {Max-Planck-Gesellschaft},
	address	 = {München},
	url	 = {http://www.mpg.de/8862100/JB_2015},
}

@techreport{IRFITIUSMK15,
	author	 = {Julian Kunkel},
	title	 = {{Identifying Relevant Factors in the I/O-Path using Statistical Methods}},
	year	 = {2015},
	month	 = {03},
	publisher	 = {Research Group: Scientific Computing, University of Hamburg},
	address	 = {Deutsches Klimarechenzentrum GmbH, Bundesstraße 45a, D-20146 Hamburg},
	series	 = {Research Papers},
	number	 = {3},
	abstract	 = {File systems of supercomputers are complex systems of hardware and software. They utilize many optimization techniques such as the cache hierarchy to speed up data access. Unfortunately, this complexity makes assessing I/O difficult. It is impossible to predict the performance of a single I/O operation without knowing the exact system state, as optimizations such as client-side caching of the parallel file system may speed up performance significantly. I/O tracing and characterization tools help capture the application workload and quantitatively assess the performance. However, users have to decide themselves whether the obtained performance is acceptable. In this paper, a density-based method from statistics is investigated to build a model which assists administrators in identifying relevant causes (performance factors). Additionally, the model can be applied to purge unexpectedly slow operations that are caused by significant congestion on a shared resource. It is sketched how this could be used in the long term to automatically assess performance and identify the likely cause. The main contribution of the paper is the presentation of a novel methodology to identify relevant performance factors by inspecting the observed execution time on the client side. Starting from a black-box model, the methodology is applicable without fully understanding all hardware and software components of the complex system. It then guides the analysis from observations and fosters identification of the most significant performance factors in the I/O path. To evaluate the approach, a model is trained on DKRZ's supercomputer Mistral and validated on synthetic benchmarks. It is demonstrated that the methodology is currently able to distinguish between several client-side storage cases, such as sequential and random memory layout and cached or uncached data; this will be extended in the future to include server-side I/O factors as well.},
}

@techreport{WEEIBCFKN14,
	author	 = {Andre Brinkmann and Toni Cortes and Hugo Falter and Julian Kunkel and Sai Narasimhamurthy},
	title	 = {{Whitepaper: E10 -- Exascale IO}},
	year	 = {2014},
	month	 = {06},
	url	 = {http://www.exascale10.com/},
}

@techreport{HATASEOAAS11,
	author	 = {Julian Kunkel},
	title	 = {{HDTrace – A Tracing and Simulation Environment of Application and System Interaction}},
	year	 = {2011},
	month	 = {01},
	publisher	 = {Research Group: Scientific Computing, University of Hamburg},
	address	 = {Deutsches Klimarechenzentrum GmbH, Bundesstraße 45a, D-20146 Hamburg},
	series	 = {Research Papers},
	number	 = {2},
	abstract	 = {HDTrace is an environment that allows tracing and simulating the behavior of MPI programs on a cluster.
It explicitly includes support for tracing the internals of MPICH2 and the parallel file system PVFS.
With this support, it enables localizing inefficiencies, conducting research on new algorithms, and evaluating future systems.
Simulation provides upper bounds of the expected performance and helps to assess observed performance, as the potential performance gains of optimizations can be approximated.

In this paper, the environment is introduced and several examples depict how it assists in revealing internal behavior and spotting bottlenecks.
In an example with PVFS, the inefficient write-out of a matrix diagonal could be identified either by inspecting the PVFS server behavior or by simulation.
Additionally, the simulation showed that in theory the operation should finish 20 times faster on our cluster -- by applying the correct MPI hints, this potential could be exploited.},
}

@techreport{SPCOSOWTCH10,
	author	 = {Julian Kunkel and Petra Nerge},
	title	 = {{System Performance Comparison of Stencil Operations with the Convey HC-1}},
	year	 = {2010},
	month	 = {11},
	publisher	 = {Research Group: Scientific Computing, University of Hamburg},
	address	 = {Deutsches Klimarechenzentrum GmbH, Bundesstraße 45a, D-20146 Hamburg},
	series	 = {Technical Reports},
	number	 = {1},
	abstract	 = {In this technical report, our first experiences with a Convey HC-1 are documented.
Several stencil application kernels are evaluated, and related work in the area of CPUs, GPUs and FPGAs is discussed.
The performance of the C and Fortran stencil benchmarks in single and double precision is reported.
Benchmarks were run on Blizzard -- the IBM supercomputer at DKRZ -- the working group's Intel Westmere cluster, and the Convey HC-1 provided at KIT.

With the Vector personality, the performance of the Convey system is not convincing.
However, there is potential in programming custom personalities.
The major issue is approximating the performance of an FPGA implementation before the time-consuming implementation is carried out.},
	url	 = {http://www.wr.informatik.uni-hamburg.de/_media/research/publications/2010/spcosowtch10-system_performance_comparison_of_stencil_operations_with_the_convey_hc_1.pdf},
}

@techreport{CONCBODOIR10,
	author	 = {Julian Kunkel and Jan C. Neddermeyer and Thomas Ludwig},
	title	 = {{Classification of Network Computers Based on Distribution of ICMP-echo Round-trip Times}},
	year	 = {2010},
	month	 = {09},
	publisher	 = {Staats- und Universitätsbibliothek Hamburg},
	address	 = {Carl von Ossietzky, Von-Melle-Park 3, 20146 Hamburg},
	series	 = {Research Papers},
	number	 = {1},
	abstract	 = {Classification of network hosts into groups of similar hosts allows an attacker to transfer knowledge gathered from one host of a group to others.
In this paper, we demonstrate that it is possible to classify hosts by inspecting the distributions of the response times of ICMP echo requests. In particular, it is shown that the response time of a host is like a fingerprint covering components inside the network, the host's software, as well as some hardware aspects of the target.

This allows identifying nodes consisting of similar hardware and OS. Instances of virtual machines hosted on the same physical hardware can be detected in the same way. To understand the influence of hardware and software components, a simple model is built and the quantitative contribution of each component to the round-trip time is briefly evaluated.

Several experiments show the successful application of the classifier inside an Ethernet LAN and over the Internet.},
	url	 = {http://epub.sub.uni-hamburg.de/informatik/volltexte/2010/152/pdf/Classification_of_Network_Computers_Based_on_Distributions_of_ICMP_Echo_Round_Trip_Times.pdf},
}