hw1_dry.lyx

#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass extreport
\begin_preamble
% Packages: TikZ for drawings, enumitem for list-label customisation.
\usepackage{tikz}
\usepackage{enumitem}

% TikZ libraries.
% The deprecated "snakes" library is replaced by the two decoration libraries
% that provide the decorations used by the styles below:
%   decorations.pathmorphing  -> "snake" (used by the snaky style)
%   decorations.pathreplacing -> "brace" (used by bracy / bracymirror)
\usetikzlibrary{patterns}
\usetikzlibrary{positioning}
\usetikzlibrary{decorations.pathmorphing}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{calc}
\usetikzlibrary{arrows}

% Node styles: a thick, light-gray filled circle; "bold" additionally uses a double outline.
\tikzset{bold/.style={draw,circle, thick, double, minimum size=20, fill=gray!20}}
\tikzset{reg/.style={draw,circle,thick, minimum size=20, fill=gray!20}}
% Path style: thick line shortened by 2 at both its start and its end.
\tikzset{regp/.style={shorten <= 2, shorten >= 2, thick}}
% Wavy ("snake") path; the final 2mm (post length) are left undecorated.
\tikzset{snaky/.style={decorate, decoration = {snake, post length=2mm, amplitude=1.5mm, segment length=5mm}}}
% Curly brace along a path, tip at the middle (aspect 0.5); the mirror variant flips it to the other side.
\tikzset{bracy/.style={decorate, decoration = {brace, aspect = 0.5, amplitude = 3mm}}}
\tikzset{bracymirror/.style={decorate, decoration = {brace, aspect = 0.5, amplitude = 3mm, mirror}}}

% First-level itemize bullet.
\renewcommand\labelitemi{$\bullet$}
% Letter "b" followed by an empty \bar accent pulled back over it with negative spacing.
% NOTE(review): \bar is a math accent, so this presumably is meant for math mode only -- confirm.
\newcommand{\bbar}{b\hspace*{-0.2em}\bar{}\hspace*{0.1em}}

% enumitem labels per nesting level: "a." (level 1), "1)" (level 2), ".i" (level 3).
\setenumerate{label=\alph{enumi}.}
\setenumerate[2]{label=\arabic{enumii})}
\setenumerate[3]{label=.\roman{enumiii}}
\end_preamble
\use_default_options true
\begin_modules
enumitem
theorems-ams-bytype
theorems-ams-extended-bytype
\end_modules
\maintain_unincluded_children false
\begin_local_layout
Style Itemize
  ItemSep 0
  ParSep 0
End
Style Enumerate
  ItemSep 0
  ParSep 0
End
\end_local_layout
\language american
\language_package default
\inputencoding auto
\fontencoding global
\font_roman "default" "David CLM"
\font_sans "default" "Arimo"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts true
\font_sc false
\font_osf false
\font_sf_scale 95 95
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures true
\graphics default
\default_output_format pdf4
\output_sync 0
\bibtex_command default
\index_command default
\float_placement H
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry true
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 1
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\leftmargin 2cm
\topmargin 2cm
\rightmargin 2cm
\bottommargin 2cm
\secnumdepth -2
\tocdepth 2
\paragraph_separation skip
\defskip smallskip
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Title
HW1 - Accelerators and Accelerated Systems
\end_layout

\begin_layout Author
Shay Agroskin
\begin_inset Newline newline
\end_inset

Tomer Abraham 316219849
\end_layout

\begin_layout Section
CUDA environment overview
\end_layout

\begin_layout Itemize
CUDA version
\end_layout

\begin_deeper
\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

# nvcc --version
\end_layout

\begin_layout Plain Layout

nvcc: NVIDIA (R) Cuda compiler driver
\end_layout

\begin_layout Plain Layout

Copyright (c) 2005-2015 NVIDIA Corporation
\end_layout

\begin_layout Plain Layout

Built on Mon_Feb_16_22:59:02_CST_2015
\end_layout

\begin_layout Plain Layout

Cuda compilation tools, release 7.0, V7.0.27
\end_layout

\end_inset


\end_layout

\end_deeper
\begin_layout Itemize
GPU name: GeForce GTX 780
\end_layout

\begin_layout Itemize
Number of SMs: 12
\end_layout

\begin_layout Section
GPU serial version
\end_layout

\begin_layout Enumerate
Why is atomicAdd required?
\begin_inset Newline newline
\end_inset

A: Multiple threads can access the same cell in the histogram: if two image
 cells have a value of 200, then the threads handling them will both access
 the 200th cell in the histogram array.
 To synchronize between them we use atomicAdd.
\end_layout

\begin_layout Enumerate
How many threads did you use and why?
\begin_inset Newline newline
\end_inset

A: We used 256 threads so that every thread works on a single cell in
 the histogram.
 If we used more, we would have to restrict the active threads with an 'if'
 condition, thus creating unnecessary divergence.
\end_layout

\begin_layout Enumerate
What is the total run time and the throughput 
\begin_inset Formula $\left(\frac{images}{sec}\right)$
\end_inset

 ?
\begin_inset Newline newline
\end_inset

A: We ran the tests on 500 images.
\begin_inset Newline newline
\end_inset

Total run time: 
\begin_inset Formula $193.53\,msec$
\end_inset


\begin_inset Newline newline
\end_inset

Throughput: 
\begin_inset Formula $\frac{500}{193.53\,msec}=2.5835\,\frac{images}{msec}\approx2583.6\,\frac{images}{sec}$
\end_inset


\end_layout

\begin_layout Enumerate
Show a clear screenshot of the execution of at least two kernel functions
 and their respective memory movement.
\end_layout

\begin_deeper
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
	filename p_3.7.png
	scale 50

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset Caption Standard

\begin_layout Plain Layout
kernel invocation with memory movement, serial execution
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\end_deeper
\begin_layout Enumerate
Choose one 'memcpy' from CPU to GPU, and report its time
\end_layout

\begin_deeper
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
	filename p_3.8.png
	scale 50

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset Caption Standard

\begin_layout Plain Layout
memory copy from CPU to GPU, serial execution
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
As we can see, the memory transfer from CPU to GPU takes
 
\begin_inset Formula $59.265\,\mu sec$
\end_inset

 on the CPU side and 
\begin_inset Formula $13.345\,\mu sec$
\end_inset

 on the 
\begin_inset Formula $GPU$
\end_inset

 side (we can deduce that the CPU is the one that performs the memory transfer,
 and the 
\begin_inset Formula $GPU$
\end_inset

 only handles the synchronization).
\end_layout

\end_deeper
\begin_layout Section
GPU bulk section
\end_layout

\begin_layout Enumerate
What is the total execution time and the speedup compared to the serial
 version?
\begin_inset Newline newline
\end_inset

A: Total execution time is 
\begin_inset Formula $13.33ms$
\end_inset

 which is a speedup of 
\begin_inset Formula $\frac{190.99ms}{13.33ms}=14.327$
\end_inset

 compared to the serial version
\end_layout

\begin_layout Enumerate
Attach a clear screenshot of the execution of the bulk section from the
 NVIDIA profiler
\begin_inset Newline newline
\end_inset


\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
	filename p_4.5.png
	width 80line%
	height 6cm

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset Caption Standard

\begin_layout Plain Layout
Profiling of the bulk execution
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Enumerate
Report the time a CPU to GPU 'memcpy' takes.
 Compare it to the time measured in the serial implementation.
 Does the time grow linearly with the size of the data being copied?
\begin_inset Newline newline
\end_inset


\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
	filename p_4.6.png
	scale 50

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
memory copy from CPU to GPU, bulk execution
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_deeper
\begin_layout Standard
It takes 
\begin_inset Formula $5.265msec$
\end_inset

 and 
\begin_inset Formula $5.227msec$
\end_inset

 on the CPU and GPU sides, respectively, which is 
\begin_inset Formula $\frac{5.265msec}{59.265\,\mu sec}=88.84$
\end_inset

 and 
\begin_inset Formula $\frac{5.227msec}{13.345\,\mu sec}=391.68$
\end_inset

 times the time it took for a single memory copy in the serial implementation.
 Since we copy 500 times more pictures, it's clear that the time doesn't
 grow linearly: it grows more slowly than the amount of data being copied.
\end_layout

\end_deeper
\end_body
\end_document