% Use this when submitting an article to a sponsored event. You'll receive a unique submission ID from the organizers
% of the event, and this ID should be used as the parameter to this command.
%\acmSubmissionID{123-A56-BU3}
%
% The majority of ACM publications use numbered citations and references. If you are preparing content for an event
% sponsored by ACM SIGGRAPH, you must use the "author year" style of citations and references. Uncommenting
% the next command will enable that style.
%\citestyle{acmauthoryear}
%
% end of the preamble, start of the body of the document source.
\begin{document}
%
% The "title" command has an optional parameter, allowing the author to define a "short title" to be used in page headers.
\title{ML Training Pipeline in HPC: A Use Case}
%
% The "author" command and its associated commands are used to define the authors and their affiliations.
% Of note is the shared affiliation of the first two authors, and the "authornote" and "authornotemark" commands
% used to denote shared contribution to the research.
\author{Aaron Saxton}
\email{saxton@illinois.edu}
\affiliation{%
\department{Blue Waters Project Office}
\institution{National Center for Supercomputing Applications, University of Illinois at Urbana-Champaign}
\city{Urbana}
\state{Illinois}
\country{USA}
}
%
% By default, the full list of authors will be used in the page headers. Often, this list is too long, and will overlap
% other information printed in the page headers. This command allows the author to define a more concise list
% of authors' names for this purpose.
\renewcommand{\shortauthors}{Saxton}
%
% The abstract is a short summary of the work to be presented in the article.
\begin{abstract}
Developing ML algorithms is as much about the data as it is about the model. The success of ``ResNet 50 in 15 min''~\cite{akiba2017extremely} showed that it is possible to scale model training, but ImageNet was a major contributing factor to that success, having been carefully curated. Frameworks like TensorFlow, PyTorch, and Flux simplified many aspects of model design and left the data curation and wrangling up to the practitioner. As a result, novel applications of ML to original datasets often struggle to achieve the same scalability as ``ResNet 50 in 15 min''~\cite{akiba2017extremely}. One hindrance is scalable and queryable access to truly large datasets. In this paper we describe an HPC workflow that has allowed the Blue Waters team to develop ML models on large and previously un-curated data.
\end{abstract}
%
% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
% Please copy and paste the code instead of the example below.
% Keywords. The author(s) should pick words that accurately describe the work being
% presented. Separate the keywords with commas.
%\keywords{distributed datastore, MongoDB, high performance computing, shared filesystem}
%
% This command processes the author and affiliation and title information and builds
% the first part of the formatted document.
\maketitle
\input{HPC_AI_Pipeline_Introduction.tex}
%
% The acknowledgments section is defined using the "acks" environment (and NOT an unnumbered section). This ensures
% the proper identification of the section in the article metadata, and the consistent spelling of the heading.
\begin{acks}
This research is part of the Blue Waters sustained-petascale computing project, which is supported by the National Science Foundation (awards OCI-0725070 and ACI-1238993) and the state of Illinois. Blue Waters is a joint effort of the University of Illinois at Urbana-Champaign and its National Center for Supercomputing Applications.