forked from NVIDIA/apex
Co-authored-by: Aditya Agrawal <aditya.iitb@gmail.com>
Co-authored-by: Marek Kolodziej <mkolod@gmail.com>
Showing 67 changed files with 5,987 additions and 1 deletion.
@@ -0,0 +1,21 @@

1. How do I intercept the Adam optimizer in APEX?

```python
from apex import pyprof
import fused_adam_cuda
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
```

2. If you are using JIT and/or AMP, the correct initialization sequence, sketched below, is:
   1. Let any JIT compilation finish.
   2. Initialize pyprof: `pyprof.nvtx.init()`.
   3. Initialize AMP.
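A minimal sketch of that ordering, assuming the standard `apex.amp.initialize` entry point (the model, optimizer, and `opt_level` below are placeholders):

```python
import torch
from apex import amp, pyprof

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 1. Let any JIT work (e.g. scripting a module) finish first.
scripted = torch.jit.script(model)

# 2. Then initialize pyprof.
pyprof.nvtx.init()

# 3. Initialize AMP last.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```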
3. How do I profile with `torch.distributed.launch`?

```sh
nvprof -f -o net%p.sql \
    --profile-from-start off \
    --profile-child-processes \
    python -m torch.distributed.launch net.py
```
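Here `%p` in the output name is expanded by `nvprof` to the process ID, so each child process launched by `torch.distributed.launch` writes its own `.sql` file.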
@@ -0,0 +1,3 @@
import warnings

from . import nvtx
@@ -0,0 +1,4 @@
__pycache__
*.sql
*.dict
*.csv
@@ -0,0 +1 @@
This directory has examples of how to use `pyprof` with APEX extensions, e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.
@@ -0,0 +1,20 @@
import torch
import fused_adam_cuda
from apex.optimizers import FusedAdam, FP16_Optimizer
from apex import pyprof

# Register pyprof and intercept the fused Adam CUDA extension.
pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

model = torch.nn.Linear(10, 20).cuda().half()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = FusedAdam(model.parameters())
optimizer = FP16_Optimizer(optimizer)

# One half-precision forward/backward/update step.
x = torch.ones(32, 10).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(20).cuda()
y = model(x)
loss = criterion(y, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
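Each example in this directory can be profiled with `nvprof` and post-processed with `parse.py` and `prof.py`; the `run.sh` script further down automates exactly that loop.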
@@ -0,0 +1,28 @@
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
from apex import pyprof

# Intercept all four entry points of the fused layer norm CUDA extension.
pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine')

input = torch.randn(20, 5, 10, 10).cuda()

# With learnable parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)

# Without learnable parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)

# Normalize over the last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)

# Normalize over the last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)
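The extension exposes separate affine and non-affine entry points, which is why all four are wrapped; presumably the `*_affine` pair fires when the module is built with `elementwise_affine=True` (the default) and the plain pair otherwise.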
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
PYPROF="$SCRIPTPATH/../.."

parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"

for f in *.py
do
    base=$(basename "$f" .py)
    sql=$base.sql
    dict=$base.dict

    # Profile with nvprof
    echo "nvprof -fo $sql python $f"
    nvprof -fo $sql python $f

    # Parse the SQL database
    echo $parse $sql
    $parse $sql > $dict

    # Print the profile (column width 130)
    echo $prof $dict
    $prof -w 130 $dict
    \rm $sql $dict
done
@@ -0,0 +1 @@
This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`; however, users can add support for byte and flop counts for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class, as sketched below.
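A hypothetical sketch of such an extension. The module path, the required method names, and the layout of the marker dictionary `d` are assumptions based on the built-in operator classes in `pyprof/prof`, not confirmed by this diff:

```python
from functools import reduce
from apex.pyprof.prof.base import OperatorLayerBase  # assumed location

class Foo(OperatorLayerBase):
    """Byte and flop accounting for the custom Foo op (out = in1 + in2)."""

    def __init__(self, d):
        # `d` is assumed to carry the NVTX marker info captured by parse.py,
        # including argument shapes and dtypes.
        self.shape = d.args[0]['shape']
        self.type = d.args[0]['dtype']

    def op(self):
        return "foo"

    def mod(self):
        return "Foo"

    def tc(self):
        return "-"                       # does not use tensor cores

    def params(self):
        return str(self.shape)

    def elems(self):
        return reduce(lambda a, b: a * b, self.shape, 1)

    def flops(self):
        return self.elems()              # one add per output element

    def bytes(self):
        b = 2 if self.type == "float16" else 4
        return 3 * self.elems() * b      # two reads + one write per element
```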
apex/pyprof/examples/custom_func_module/custom_function.py (33 additions, 0 deletions)
@@ -0,0 +1,33 @@
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
from apex import pyprof

# Initialize pyprof
pyprof.nvtx.init()

class Foo(torch.autograd.Function):
    @staticmethod
    def forward(ctx, in1, in2):
        out = in1 + in2      # This could be a custom C/C++ function.
        return out

    @staticmethod
    def backward(ctx, grad):
        in1_grad = grad      # This could be a custom C/C++ function.
        in2_grad = grad      # This could be a custom C/C++ function.
        return in1_grad, in2_grad

# Hook the forward and backward functions to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
pyprof.nvtx.wrap(Foo, 'backward')

foo = Foo.apply

x = torch.ones(4, 4).cuda()
y = torch.ones(4, 4).cuda()

# Profile only the region between profiler.start() and profiler.stop().
with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    z = foo(x, y)
    profiler.stop()
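Wrapping `Foo.forward` and `Foo.backward` monkey patches those methods in place: each call is bracketed by an NVTX range that `nvprof` records and `parse.py` can later extract from the SQL database.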
@@ -0,0 +1,27 @@
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
from apex import pyprof
pyprof.nvtx.init()

class Foo(torch.nn.Module):
    def __init__(self, size):
        super(Foo, self).__init__()
        self.n = torch.nn.Parameter(torch.ones(size))
        self.m = torch.nn.Parameter(torch.ones(size))

    def forward(self, input):
        # Elementwise affine transform; could be a custom C/C++ kernel.
        return self.n * input + self.m

# Hook the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')

foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    z = foo(x)
    profiler.stop()
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
PYPROF="$SCRIPTPATH/../.."

parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"

for f in *.py
do
    base=$(basename "$f" .py)
    sql=$base.sql
    dict=$base.dict

    # Profile with nvprof
    echo "nvprof -fo $sql python $f"
    nvprof -fo $sql python $f

    # Parse the SQL database
    echo $parse $sql
    $parse $sql > $dict

    # Print the profile (column width 130)
    echo $prof $dict
    $prof -w 130 $dict
    \rm $sql $dict
done
@@ -0,0 +1,137 @@
#!/usr/bin/env python3

"""
Example of running pyprof with the torchvision imagenet models.
"""

import sys
import torch
import torch.nn as nn
import torchvision.models as models
import torch.cuda.profiler as profiler
import argparse

from apex import pyprof
from apex.optimizers import FusedAdam, FP16_Optimizer
import fused_adam_cuda

def parseArgs():
    parser = argparse.ArgumentParser(prog=sys.argv[0], description="Run popular imagenet models.")

    parser.add_argument("-m",
        type=str,
        default="resnet50",
        choices=["alexnet", "densenet121", "densenet161", "densenet169", "densenet201", "googlenet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3", "mobilenet_v2", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnext50_32x4d", "resnext101_32x8d", "wide_resnet50_2", "wide_resnet101_2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0", "squeezenet1_0", "squeezenet1_1", "vgg11", "vgg11_bn", "vgg13", "vgg13_bn", "vgg16", "vgg16_bn", "vgg19", "vgg19_bn", "inception_v3"],
        help="Model.")

    parser.add_argument("-b",
        type=int,
        default=32,
        help="Batch size.")

    parser.add_argument("-o",
        type=str,
        default="adam",
        choices=["adam", "sgd"],
        help="Optimizer.")

    args = parser.parse_args()
    return args
# Input height/width and any constructor options for each model.
d = {
    "alexnet": {'H': 224, 'W': 224, 'opts': {}},

    "densenet121": {'H': 224, 'W': 224, 'opts': {}},
    "densenet161": {'H': 224, 'W': 224, 'opts': {}},
    "densenet169": {'H': 224, 'W': 224, 'opts': {}},
    "densenet201": {'H': 224, 'W': 224, 'opts': {}},

    "googlenet": {'H': 224, 'W': 224, 'opts': {'aux_logits': False}},

    "mnasnet0_5": {'H': 224, 'W': 224, 'opts': {}},
    "mnasnet0_75": {'H': 224, 'W': 224, 'opts': {}},
    "mnasnet1_0": {'H': 224, 'W': 224, 'opts': {}},
    "mnasnet1_3": {'H': 224, 'W': 224, 'opts': {}},

    "mobilenet_v2": {'H': 224, 'W': 224, 'opts': {}},

    "resnet18": {'H': 224, 'W': 224, 'opts': {}},
    "resnet34": {'H': 224, 'W': 224, 'opts': {}},
    "resnet50": {'H': 224, 'W': 224, 'opts': {}},
    "resnet101": {'H': 224, 'W': 224, 'opts': {}},
    "resnet152": {'H': 224, 'W': 224, 'opts': {}},

    "resnext50_32x4d": {'H': 224, 'W': 224, 'opts': {}},
    "resnext101_32x8d": {'H': 224, 'W': 224, 'opts': {}},

    "wide_resnet50_2": {'H': 224, 'W': 224, 'opts': {}},
    "wide_resnet101_2": {'H': 224, 'W': 224, 'opts': {}},

    "shufflenet_v2_x0_5": {'H': 224, 'W': 224, 'opts': {}},
    "shufflenet_v2_x1_0": {'H': 224, 'W': 224, 'opts': {}},
    "shufflenet_v2_x1_5": {'H': 224, 'W': 224, 'opts': {}},
    "shufflenet_v2_x2_0": {'H': 224, 'W': 224, 'opts': {}},

    "squeezenet1_0": {'H': 224, 'W': 224, 'opts': {}},
    "squeezenet1_1": {'H': 224, 'W': 224, 'opts': {}},

    "vgg11": {'H': 224, 'W': 224, 'opts': {}},
    "vgg11_bn": {'H': 224, 'W': 224, 'opts': {}},
    "vgg13": {'H': 224, 'W': 224, 'opts': {}},
    "vgg13_bn": {'H': 224, 'W': 224, 'opts': {}},
    "vgg16": {'H': 224, 'W': 224, 'opts': {}},
    "vgg16_bn": {'H': 224, 'W': 224, 'opts': {}},
    "vgg19": {'H': 224, 'W': 224, 'opts': {}},
    "vgg19_bn": {'H': 224, 'W': 224, 'opts': {}},

    "inception_v3": {'H': 299, 'W': 299, 'opts': {'aux_logits': False}},
}
def main():
    args = parseArgs()

    pyprof.nvtx.init()
    pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

    N = args.b
    C = 3
    H = d[args.m]['H']
    W = d[args.m]['W']
    opts = d[args.m]['opts']
    classes = 1000

    net = getattr(models, args.m)
    net = net(**opts).cuda().half()
    net.train()

    x = torch.rand(N, C, H, W).cuda().half()
    target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    if args.o == "sgd":
        optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    elif args.o == "adam":
        optimizer = FusedAdam(net.parameters())
        optimizer = FP16_Optimizer(optimizer)
    else:
        assert False

    # Warm up without the profiler
    for i in range(2):
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Profile one training iteration
    with torch.autograd.profiler.emit_nvtx():
        profiler.start()
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        profiler.stop()

if __name__ == "__main__":
    main()
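The two warm-up iterations are a deliberate design choice: they let one-time costs such as cuDNN algorithm selection, memory-pool growth, and kernel compilation happen before `profiler.start()`, so the profiled iteration reflects steady-state behavior.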