Adding PyProf to Apex (NVIDIA#404)
Co-authored-by: Aditya Agrawal <aditya.iitb@gmail.com>
Co-authored-by: Marek Kolodziej <mkolod@gmail.com>
2 people authored and mcarilli committed Aug 13, 2019
1 parent 4a8c4ac commit 880ab92
Showing 67 changed files with 5,987 additions and 1 deletion.
5 changes: 5 additions & 0 deletions README.md
@@ -94,6 +94,11 @@ A Python-only build omits:
- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.

To enable PyProf support, install the packages PyProf requires by adding the `--pyprof` option at installation time:
```
$ pip install -v --no-cache-dir --global-option="--pyprof" --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```

### Windows support
Windows support is experimental, and Linux is recommended. `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
on your system. `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
2 changes: 2 additions & 0 deletions apex/__init__.py
@@ -1,5 +1,6 @@
# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
import torch
import warnings

from . import parallel
from . import amp
@@ -14,3 +15,4 @@
# load time) the error message is timely and visible.
from . import optimizers
from . import normalization
from . import pyprof
21 changes: 21 additions & 0 deletions apex/pyprof/FAQs.md
@@ -0,0 +1,21 @@
1. How do I intercept the Adam optimizer in APEX?

```python
from apex import pyprof
import fused_adam_cuda
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
```

2. If you are using JIT and/or AMP, the correct initialization sequence is (sketched below):
    1. Let any JIT compilation finish.
    2. Initialize pyprof: `pyprof.nvtx.init()`.
    3. Initialize AMP.
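
A minimal sketch of this ordering, assuming an `apex.amp` script (the resnet50 model and SGD optimizer are placeholders):

```python
import torch
import torchvision.models as models
from apex import amp, pyprof

model = models.resnet50().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 1. Let any JIT work finish first (e.g. torch.jit.script on auxiliary modules).

# 2. Initialize pyprof before AMP.
pyprof.nvtx.init()

# 3. Initialize AMP last.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```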

3. How do I profile with `torch.distributed.launch`?

```sh
nvprof -f -o net%p.sql \
--profile-from-start off \
--profile-child-processes \
python -m torch.distributed.launch net.py
```
252 changes: 252 additions & 0 deletions apex/pyprof/README.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions apex/pyprof/__init__.py
@@ -0,0 +1,3 @@
import warnings

from . import nvtx
4 changes: 4 additions & 0 deletions apex/pyprof/examples/.gitignore
@@ -0,0 +1,4 @@
__pycache__
*.sql
*.dict
*.csv
1 change: 1 addition & 0 deletions apex/pyprof/examples/apex/README.md
@@ -0,0 +1 @@
This directory has examples of how to use `pyprof` with APEX extensions, e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.
20 changes: 20 additions & 0 deletions apex/pyprof/examples/apex/fused_adam.py
@@ -0,0 +1,20 @@
import torch
import fused_adam_cuda
from apex.optimizers import FusedAdam, FP16_Optimizer
from apex import pyprof

pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

model = torch.nn.Linear(10, 20).cuda().half()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = FusedAdam(model.parameters())
optimizer = FP16_Optimizer(optimizer)

x = torch.ones(32, 10).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(20).cuda()
y = model(x)
loss = criterion(y, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
28 changes: 28 additions & 0 deletions apex/pyprof/examples/apex/fused_layer_norm.py
@@ -0,0 +1,28 @@
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
from apex import pyprof

pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine')

input = torch.randn(20, 5, 10, 10).cuda()

# With Learnable Parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)

# Without Learnable Parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)

# Normalize over last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)

# Normalize over last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)
30 changes: 30 additions & 0 deletions apex/pyprof/examples/apex/test.sh
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."

parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"

for f in *.py
do
	base=`basename $f .py`
	sql=$base.sql
	dict=$base.dict

	#NVprof
	echo "nvprof -fo $sql python $f"
	nvprof -fo $sql python $f

	#Parse
	echo $parse $sql
	$parse $sql > $dict

	#Prof
	echo $prof $dict
	$prof -w 130 $dict
	\rm $sql $dict
done
1 change: 1 addition & 0 deletions apex/pyprof/examples/custom_func_module/README.md
@@ -0,0 +1 @@
This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`; however, users can add support for byte and FLOP calculations for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class, as sketched below.
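
For illustration, a minimal (hypothetical) sketch of such an extension follows; the import path and the exact set of methods required by `OperatorLayerBase` (`op`, `mod`, `tc`, `params`, `flops`, `bytes`) are assumptions and may differ between versions:

```python
from apex.pyprof.prof.base import OperatorLayerBase  # assumed import path

class CustomAdd(OperatorLayerBase):
	"""Assumed byte/FLOP model for an elementwise add of two float32 tensors."""

	def __init__(self, shape, itemsize=4):
		self.n = 1
		for dim in shape:
			self.n *= dim          # total number of output elements
		self.itemsize = itemsize   # bytes per element (float32 assumed)

	def op(self):
		return "custom_add"        # operator name to report

	def mod(self):
		return "Foo"               # module/class name to report

	def tc(self):
		return "-"                 # no Tensor Core usage

	def params(self):
		return "elems={}".format(self.n)

	def flops(self):
		return self.n              # one add per output element

	def bytes(self):
		return 3 * self.n * self.itemsize  # read two inputs, write one output
```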
33 changes: 33 additions & 0 deletions apex/pyprof/examples/custom_func_module/custom_function.py
@@ -0,0 +1,33 @@
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
from apex import pyprof
#Initialize pyprof
pyprof.nvtx.init()

class Foo(torch.autograd.Function):
	@staticmethod
	def forward(ctx, in1, in2):
		out = in1 + in2 #This could be a custom C/C++ function.
		return out

	@staticmethod
	def backward(ctx, grad):
		in1_grad = grad #This could be a custom C/C++ function.
		in2_grad = grad #This could be a custom C/C++ function.
		return in1_grad, in2_grad

#Hook the forward and backward functions to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
pyprof.nvtx.wrap(Foo, 'backward')

foo = Foo.apply

x = torch.ones(4,4).cuda()
y = torch.ones(4,4).cuda()

with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x,y)
profiler.stop()
27 changes: 27 additions & 0 deletions apex/pyprof/examples/custom_func_module/custom_module.py
@@ -0,0 +1,27 @@
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
from apex import pyprof
pyprof.nvtx.init()

class Foo(torch.nn.Module):
	def __init__(self, size):
		super(Foo, self).__init__()
		self.n = torch.nn.Parameter(torch.ones(size))
		self.m = torch.nn.Parameter(torch.ones(size))

	def forward(self, input):
		return self.n*input + self.m

#Hook the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')

foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()

with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x)
profiler.stop()
30 changes: 30 additions & 0 deletions apex/pyprof/examples/custom_func_module/test.sh
@@ -0,0 +1,30 @@
#!/bin/bash

set -e

SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."

parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"

for f in *.py
do
	base=`basename $f .py`
	sql=$base.sql
	dict=$base.dict

	#NVprof
	echo "nvprof -fo $sql python $f"
	nvprof -fo $sql python $f

	#Parse
	echo $parse $sql
	$parse $sql > $dict

	#Prof
	echo $prof $dict
	$prof -w 130 $dict
	\rm $sql $dict
done
137 changes: 137 additions & 0 deletions apex/pyprof/examples/imagenet/imagenet.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python3

"""
Example to run pyprof with imagenet models.
"""

import sys
import torch
import torch.nn as nn
import torchvision.models as models
import torch.cuda.profiler as profiler
import argparse

from apex import pyprof
from apex.optimizers import FusedAdam, FP16_Optimizer
import fused_adam_cuda

def parseArgs():
	parser = argparse.ArgumentParser(prog=sys.argv[0], description="Run popular imagenet models.")

	parser.add_argument("-m",
		type=str,
		default="resnet50",
		choices=["alexnet", "densenet121", "densenet161", "densenet169", "densenet201", "googlenet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3", "mobilenet_v2", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnext50_32x4d", "resnext101_32x8d", "wide_resnet50_2", "wide_resnet101_2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0", "squeezenet1_0", "squeezenet1_1", "vgg11", "vgg11_bn", "vgg13", "vgg13_bn", "vgg16", "vgg16_bn", "vgg19", "vgg19_bn", "inception_v3"],
		help="Model.")

	parser.add_argument("-b",
		type=int,
		default=32,
		help="Batch size.")

	parser.add_argument("-o",
		type=str,
		default="adam",
		choices=["adam", "sgd"],
		help="Optimizer.")

	args = parser.parse_args()
	return args

d = {
"alexnet": {'H': 224, 'W': 224, 'opts': {}},

"densenet121": {'H': 224, 'W': 224, 'opts': {}},
"densenet161": {'H': 224, 'W': 224, 'opts': {}},
"densenet169": {'H': 224, 'W': 224, 'opts': {}},
"densenet201": {'H': 224, 'W': 224, 'opts': {}},

"googlenet": {'H': 224, 'W': 224, 'opts': {'aux_logits': False}},

"mnasnet0_5": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet0_75": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_0": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_3": {'H': 224, 'W': 224, 'opts': {}},

"mobilenet_v2": {'H': 224, 'W': 224, 'opts': {}},

"resnet18": {'H': 224, 'W': 224, 'opts': {}},
"resnet34": {'H': 224, 'W': 224, 'opts': {}},
"resnet50": {'H': 224, 'W': 224, 'opts': {}},
"resnet101": {'H': 224, 'W': 224, 'opts': {}},
"resnet152": {'H': 224, 'W': 224, 'opts': {}},

"resnext50_32x4d": {'H': 224, 'W': 224, 'opts': {}},
"resnext101_32x8d": {'H': 224, 'W': 224, 'opts': {}},

"wide_resnet50_2": {'H': 224, 'W': 224, 'opts': {}},
"wide_resnet101_2": {'H': 224, 'W': 224, 'opts': {}},

"shufflenet_v2_x0_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_0": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x2_0": {'H': 224, 'W': 224, 'opts': {}},

"squeezenet1_0": {'H': 224, 'W': 224, 'opts': {}},
"squeezenet1_1": {'H': 224, 'W': 224, 'opts': {}},

"vgg11": {'H': 224, 'W': 224, 'opts': {}},
"vgg11_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg13": {'H': 224, 'W': 224, 'opts': {}},
"vgg13_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg16": {'H': 224, 'W': 224, 'opts': {}},
"vgg16_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg19": {'H': 224, 'W': 224, 'opts': {}},
"vgg19_bn": {'H': 224, 'W': 224, 'opts': {}},

"inception_v3": {'H': 299, 'W': 299, 'opts': {'aux_logits': False}},
}

def main():
	args = parseArgs()

	pyprof.nvtx.init()
	pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

	N = args.b
	C = 3
	H = d[args.m]['H']
	W = d[args.m]['W']
	opts = d[args.m]['opts']
	classes = 1000

	net = getattr(models, args.m)
	net = net(**opts).cuda().half()
	net.train()

	x = torch.rand(N, C, H, W).cuda().half()
	target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

	criterion = nn.CrossEntropyLoss().cuda()
	if (args.o == "sgd"):
		optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
	elif (args.o == "adam"):
		optimizer = FusedAdam(net.parameters())
		optimizer = FP16_Optimizer(optimizer)
	else:
		assert False

	#Warm up without profiler
	for i in range(2):
		output = net(x)
		loss = criterion(output, target)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

	with torch.autograd.profiler.emit_nvtx():
		profiler.start()
		output = net(x)
		loss = criterion(output, target)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()
		profiler.stop()

if __name__ == "__main__":
	main()