Adding PyProf to Apex (NVIDIA#404)
Co-authored-by: Aditya Agrawal <aditya.iitb@gmail.com>
Co-authored-by: Marek Kolodziej <mkolod@gmail.com>
2 people authored and mcarilli committed Aug 13, 2019
1 parent 4a8c4ac commit 880ab92
Showing 67 changed files with 5,987 additions and 1 deletion.
5 changes: 5 additions & 0 deletions README.md
@@ -94,6 +94,11 @@ A Python-only build omits:
- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.

To enable PyProf support, install the packages PyProf requires by adding the `--pyprof` option at installation time:
```
$ pip install -v --no-cache-dir --global-option="--pyprof" --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```

### Windows support
Windows support is experimental, and Linux is recommended. `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
on your system. `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
2 changes: 2 additions & 0 deletions apex/__init__.py
@@ -1,5 +1,6 @@
# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
import torch
import warnings

from . import parallel
from . import amp
@@ -14,3 +15,4 @@
# load time) the error message is timely and visible.
from . import optimizers
from . import normalization
from . import pyprof
21 changes: 21 additions & 0 deletions apex/pyprof/FAQs.md
@@ -0,0 +1,21 @@
1. How do I intercept the Adam optimizer in APEX?

```python
from apex import pyprof
import fused_adam_cuda
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
```

2. If you are using JIT and/or AMP, the correct initialization sequence is (sketched below):
    1. Let any JIT compilation finish.
    2. Initialize pyprof: `pyprof.nvtx.init()`.
    3. Initialize AMP.
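
A minimal sketch of this ordering, assuming an `apex.amp` script (the resnet50 model and SGD optimizer are placeholders):

```python
import torch
import torchvision.models as models
from apex import amp, pyprof

model = models.resnet50().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 1. Let any JIT work finish first (e.g. torch.jit.script on auxiliary modules).

# 2. Initialize pyprof before AMP.
pyprof.nvtx.init()

# 3. Initialize AMP last.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```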

3. How do I profile with `torch.distributed.launch`?

```sh
nvprof -f -o net%p.sql \
--profile-from-start off \
--profile-child-processes \
python -m torch.distributed.launch net.py
```
252 changes: 252 additions & 0 deletions apex/pyprof/README.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions apex/pyprof/__init__.py
@@ -0,0 +1,3 @@
import warnings

from . import nvtx
4 changes: 4 additions & 0 deletions apex/pyprof/examples/.gitignore
@@ -0,0 +1,4 @@
__pycache__
*.sql
*.dict
*.csv
1 change: 1 addition & 0 deletions apex/pyprof/examples/apex/README.md
@@ -0,0 +1 @@
This directory has examples of how to use `pyprof` with APEX extensions, e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.
20 changes: 20 additions & 0 deletions apex/pyprof/examples/apex/fused_adam.py
@@ -0,0 +1,20 @@
import torch
import fused_adam_cuda
from apex.optimizers import FusedAdam, FP16_Optimizer
from apex import pyprof

pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

model = torch.nn.Linear(10, 20).cuda().half()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = FusedAdam(model.parameters())
optimizer = FP16_Optimizer(optimizer)

x = torch.ones(32, 10).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(20).cuda()
y = model(x)
loss = criterion(y, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
28 changes: 28 additions & 0 deletions apex/pyprof/examples/apex/fused_layer_norm.py
@@ -0,0 +1,28 @@
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
from apex import pyprof

pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine')

input = torch.randn(20, 5, 10, 10).cuda()

# With Learnable Parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)

# Without Learnable Parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)

# Normalize over last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)

# Normalize over last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)
30 changes: 30 additions & 0 deletions apex/pyprof/examples/apex/test.sh
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."

parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"

for f in *.py
do
	base=`basename $f .py`
	sql=$base.sql
	dict=$base.dict

	#NVprof
	echo "nvprof -fo $sql python $f"
	nvprof -fo $sql python $f

	#Parse
	echo $parse $sql
	$parse $sql > $dict

	#Prof
	echo $prof $dict
	$prof -w 130 $dict
	\rm $sql $dict
done
1 change: 1 addition & 0 deletions apex/pyprof/examples/custom_func_module/README.md
@@ -0,0 +1 @@
This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`; however, users can add support for byte and FLOP calculations for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class, as sketched below.
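
For illustration, a minimal (hypothetical) sketch of such an extension follows; the import path and the exact set of methods required by `OperatorLayerBase` (`op`, `mod`, `tc`, `params`, `flops`, `bytes`) are assumptions and may differ between versions:

```python
from apex.pyprof.prof.base import OperatorLayerBase  # assumed import path

class CustomAdd(OperatorLayerBase):
	"""Assumed byte/FLOP model for an elementwise add of two float32 tensors."""

	def __init__(self, shape, itemsize=4):
		self.n = 1
		for dim in shape:
			self.n *= dim          # total number of output elements
		self.itemsize = itemsize   # bytes per element (float32 assumed)

	def op(self):
		return "custom_add"        # operator name to report

	def mod(self):
		return "Foo"               # module/class name to report

	def tc(self):
		return "-"                 # no Tensor Core usage

	def params(self):
		return "elems={}".format(self.n)

	def flops(self):
		return self.n              # one add per output element

	def bytes(self):
		return 3 * self.n * self.itemsize  # read two inputs, write one output
```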
33 changes: 33 additions & 0 deletions apex/pyprof/examples/custom_func_module/custom_function.py
@@ -0,0 +1,33 @@
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
from apex import pyprof
#Initialize pyprof
pyprof.nvtx.init()

class Foo(torch.autograd.Function):
	@staticmethod
	def forward(ctx, in1, in2):
		out = in1 + in2 #This could be a custom C/C++ function.
		return out

	@staticmethod
	def backward(ctx, grad):
		in1_grad = grad #This could be a custom C/C++ function.
		in2_grad = grad #This could be a custom C/C++ function.
		return in1_grad, in2_grad

#Hook the forward and backward functions to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
pyprof.nvtx.wrap(Foo, 'backward')

foo = Foo.apply

x = torch.ones(4,4).cuda()
y = torch.ones(4,4).cuda()

with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x,y)
profiler.stop()
27 changes: 27 additions & 0 deletions apex/pyprof/examples/custom_func_module/custom_module.py
@@ -0,0 +1,27 @@
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
from apex import pyprof
pyprof.nvtx.init()

class Foo(torch.nn.Module):
	def __init__(self, size):
		super(Foo, self).__init__()
		self.n = torch.nn.Parameter(torch.ones(size))
		self.m = torch.nn.Parameter(torch.ones(size))

	def forward(self, input):
		return self.n*input + self.m

#Hook the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')

foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()

with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x)
profiler.stop()
30 changes: 30 additions & 0 deletions apex/pyprof/examples/custom_func_module/test.sh
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."

parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"

for f in *.py
do
	base=`basename $f .py`
	sql=$base.sql
	dict=$base.dict

	#NVprof
	echo "nvprof -fo $sql python $f"
	nvprof -fo $sql python $f

	#Parse
	echo $parse $sql
	$parse $sql > $dict

	#Prof
	echo $prof $dict
	$prof -w 130 $dict
	\rm $sql $dict
done
137 changes: 137 additions & 0 deletions apex/pyprof/examples/imagenet/imagenet.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python3

"""
Example to run pyprof with imagenet models.
"""

import sys
import torch
import torch.nn as nn
import torchvision.models as models
import torch.cuda.profiler as profiler
import argparse

from apex import pyprof
from apex.optimizers import FusedAdam, FP16_Optimizer
import fused_adam_cuda

def parseArgs():
	parser = argparse.ArgumentParser(prog=sys.argv[0], description="Run popular imagenet models.")

	parser.add_argument("-m",
		type=str,
		default="resnet50",
		choices=["alexnet", "densenet121", "densenet161", "densenet169", "densenet201", "googlenet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3", "mobilenet_v2", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnext50_32x4d", "resnext101_32x8d", "wide_resnet50_2", "wide_resnet101_2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0", "squeezenet1_0", "squeezenet1_1", "vgg11", "vgg11_bn", "vgg13", "vgg13_bn", "vgg16", "vgg16_bn", "vgg19", "vgg19_bn", "inception_v3"],
		help="Model.")

	parser.add_argument("-b",
		type=int,
		default=32,
		help="Batch size.")

	parser.add_argument("-o",
		type=str,
		default="adam",
		choices=["adam", "sgd"],
		help="Optimizer.")

	args = parser.parse_args()
	return args

d = {
"alexnet": {'H': 224, 'W': 224, 'opts': {}},

"densenet121": {'H': 224, 'W': 224, 'opts': {}},
"densenet161": {'H': 224, 'W': 224, 'opts': {}},
"densenet169": {'H': 224, 'W': 224, 'opts': {}},
"densenet201": {'H': 224, 'W': 224, 'opts': {}},

"googlenet": {'H': 224, 'W': 224, 'opts': {'aux_logits': False}},

"mnasnet0_5": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet0_75": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_0": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_3": {'H': 224, 'W': 224, 'opts': {}},

"mobilenet_v2": {'H': 224, 'W': 224, 'opts': {}},

"resnet18": {'H': 224, 'W': 224, 'opts': {}},
"resnet34": {'H': 224, 'W': 224, 'opts': {}},
"resnet50": {'H': 224, 'W': 224, 'opts': {}},
"resnet101": {'H': 224, 'W': 224, 'opts': {}},
"resnet152": {'H': 224, 'W': 224, 'opts': {}},

"resnext50_32x4d": {'H': 224, 'W': 224, 'opts': {}},
"resnext101_32x8d": {'H': 224, 'W': 224, 'opts': {}},

"wide_resnet50_2": {'H': 224, 'W': 224, 'opts': {}},
"wide_resnet101_2": {'H': 224, 'W': 224, 'opts': {}},

"shufflenet_v2_x0_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_0": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x2_0": {'H': 224, 'W': 224, 'opts': {}},

"squeezenet1_0": {'H': 224, 'W': 224, 'opts': {}},
"squeezenet1_1": {'H': 224, 'W': 224, 'opts': {}},

"vgg11": {'H': 224, 'W': 224, 'opts': {}},
"vgg11_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg13": {'H': 224, 'W': 224, 'opts': {}},
"vgg13_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg16": {'H': 224, 'W': 224, 'opts': {}},
"vgg16_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg19": {'H': 224, 'W': 224, 'opts': {}},
"vgg19_bn": {'H': 224, 'W': 224, 'opts': {}},

"inception_v3": {'H': 299, 'W': 299, 'opts': {'aux_logits': False}},
}

def main():
	args = parseArgs()

	pyprof.nvtx.init()
	pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

	N = args.b
	C = 3
	H = d[args.m]['H']
	W = d[args.m]['W']
	opts = d[args.m]['opts']
	classes = 1000

	net = getattr(models, args.m)
	net = net(**opts).cuda().half()
	net.train()

	x = torch.rand(N, C, H, W).cuda().half()
	target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

	criterion = nn.CrossEntropyLoss().cuda()
	if (args.o == "sgd"):
		optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
	elif (args.o == "adam"):
		optimizer = FusedAdam(net.parameters())
		optimizer = FP16_Optimizer(optimizer)
	else:
		assert False

	#Warm up without profiler
	for i in range(2):
		output = net(x)
		loss = criterion(output, target)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

	with torch.autograd.profiler.emit_nvtx():
		profiler.start()
		output = net(x)
		loss = criterion(output, target)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()
		profiler.stop()

if __name__ == "__main__":
	main()