-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnvprof_time_2.log
145 lines (143 loc) · 35.2 KB
/
nvprof_time_2.log
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
==8604== NVPROF is profiling process 8604, command: /home/lanchuanxin/torch/bin/python detect.py --weights runs/exp1_yolov5s_bdd/weights/best_yolov5s_bdd.pt --source bdd100k/images/test_nvprof --save-txt
==8604== Profiling application: /home/lanchuanxin/torch/bin/python detect.py --weights runs/exp1_yolov5s_bdd/weights/best_yolov5s_bdd.pt --source bdd100k/images/test_nvprof --save-txt
==8604== Profiling result:
"Time(%)","Time","Calls","Avg","Min","Max","Name"
%,ms,,us,us,us,
25.995655,116.030742,2241,51.776000,25.249000,135.364000,"maxwell_fp16_scudnn_fp16_128x128_relu_interior_nn"
10.181729,45.445808,250,181.783000,91.203000,305.034000,"maxwell_fp16_scudnn_fp16_128x128_relu_small_nn"
9.404120,41.974979,3672,11.431000,3.296000,92.739000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<c10::Half, c10::Half, c10::Half, at::native::AddFunctor<float>>, at::detail::Array<char*, int=3>, OffsetCalculator<int=2, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, c10::Half, c10::Half, c10::Half, float, at::native::AddFunctor<float>, at::native::BinaryFunctor<c10::Half, c10::Half, c10::Half, at::native::AddFunctor<float>>)"
8.374286,37.378349,816,45.806000,25.344000,73.346000,"maxwell_fp16_scudnn_winograd_fp16_fp32_128x128_ldg1_ldg4_tile148n_nt"
8.289637,37.000521,951,38.906000,17.280000,86.243000,"maxwell_fp16_scudnn_fp16_128x64_relu_interior_nn"
7.379404,32.937723,92,358.018000,108.611000,373.100000,"void cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)"
3.384082,15.104736,4012,3.764000,1.312000,31.905000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_81_GLOBAL__N__57_tmpxft_0000c55d_00000000_11_Activation_compute_75_cpp1_ii_ec99c53b17leaky_relu_kernelERNS_18TensorIteratorBaseERKN3c106ScalarEENKUlvE_clEvENKUlvE4_clEvEUlNS5_4HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
2.425566,10.826432,884,12.247000,4.416000,34.658000,"void at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<c10::Half, unsigned int, int=4, int=128, int=1>(c10::Half*, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<c10::Half, unsigned int, int=4, int=128, int=1>, unsigned int, int=128, int=1>, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::TensorSizeStride<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata, unsigned int=4>, int, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata)"
2.207259,9.852027,68,144.882000,74.242000,171.558000,"maxwell_fp16_scudnn_fp16_128x64_relu_small_nn"
2.193686,9.791443,883,11.088000,0.736000,409.549000,"[CUDA memcpy HtoD]"
2.156412,9.625075,204,47.181000,9.729000,87.107000,"void at::native::_GLOBAL__N__63_tmpxft_0000c88a_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_6258b574::max_pool_forward_nchw<c10::Half, c10::Half>(int, c10::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_0000c88a_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_6258b574::max_pool_forward_nchw<c10::Half, c10::Half>*, long*)"
1.730406,7.723611,884,8.737000,4.672000,18.689000,"void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)"
1.704862,7.609593,68,111.905000,68.034000,146.981000,"maxwell_fp16_scudnn_winograd_fp16_fp32_128x128_ldg1_ldg4_tile148t_nt"
1.452118,6.481480,272,23.828000,11.105000,38.017000,"maxwell_fp16_scudnn_fp16_128x32_relu_interior_nn"
1.390899,6.208230,94,66.045000,64.450000,86.339000,"void im2col4d_kernel<__half, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, __half const *, __half*, int)"
1.308501,5.840450,3782,1.544000,1.184000,10.689000,"cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)"
1.280054,5.713478,544,10.502000,3.777000,29.729000,"void at::native::batch_norm_transform_input_kernel<c10::Half, c10::Half, float, bool=1, int>(at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::RestrictPtrTraits, int>, int, at::native::batch_norm_transform_input_kernel<c10::Half<std::conditional<bool=1, float, c10::Half>::type, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=1, int>, std::conditional<bool=1, float, c10::Half>::type, at::native::batch_norm_transform_input_kernel<c10::Half<float, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=1, int>, at::native::batch_norm_transform_input_kernel<c10::Half<std::conditional<bool=1, float, c10::Half>::type, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=1, int>, std::conditional)"
1.164384,5.197189,958,5.425000,1.344000,22.753000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE6_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_"
0.672130,3.000030,68,44.118000,24.801000,60.930000,"void at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<c10::Half, unsigned int, int=4, int=64, int=64>(c10::Half*, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<c10::Half, unsigned int, int=4, int=64, int=64>, unsigned int, int=64, int=64>, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::TensorSizeStride<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata, unsigned int=4>, int, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata)"
0.591210,2.638844,408,6.467000,4.672000,13.344000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE18_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjLb0EESE_NS0_6memory15LoadWithoutCastENSF_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_"
0.513884,2.293706,204,11.243000,9.824000,14.240000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>, at::detail::Array<char*, int=3>, OffsetCalculator<int=2, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithCast<int=2>, at::native::memory::StoreWithCast>(int, float, float, float, float, at::native::AddFunctor<float>, at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>)"
0.465591,2.078149,408,5.093000,4.544000,6.752000,"void at::native::unrolled_elementwise_kernel<at::native::BUnaryFunctor<c10::Half, c10::Half, c10::Half, at::native::MulFunctor<float>>, at::detail::Array<char*, int=2>, OffsetCalculator<int=1, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, c10::Half, c10::Half, c10::Half, float, at::native::MulFunctor<float>, at::native::BUnaryFunctor<c10::Half, c10::Half, c10::Half, at::native::MulFunctor<float>>)"
0.432618,1.930976,204,9.465000,8.321000,11.744000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE18_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjLb0EESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_"
0.411718,1.837689,2014,0.912000,0.864000,1.440000,"[CUDA memcpy DtoH]"
0.402163,1.795042,476,3.771000,2.016000,9.985000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BinaryFunctor<c10::Half, c10::Half, c10::Half, at::native::AddFunctor<float>>, at::detail::Array<char*, int=3>>(int, c10::Half, c10::Half)"
0.395816,1.766710,226,7.817000,1.408000,23.073000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE18_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_"
0.324013,1.446222,136,10.633000,5.600000,16.896000,"void at::native::_GLOBAL__N__64_tmpxft_0000e057_00000000_11_UpSampleNearest2d_compute_75_cpp1_ii_f539c38f::upsample_nearest2d_out_frame<c10::Half, float>(c10::Half const *, at::native::_GLOBAL__N__64_tmpxft_0000e057_00000000_11_UpSampleNearest2d_compute_75_cpp1_ii_f539c38f::upsample_nearest2d_out_frame<c10::Half, float>*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, float)"
0.289198,1.290824,204,6.327000,5.504000,8.832000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<c10::Half, c10::Half, c10::Half, at::native::MulFunctor<float>>, at::detail::Array<char*, int=3>, OffsetCalculator<int=2, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, c10::Half, c10::Half, c10::Half, float, at::native::MulFunctor<float>, at::native::BinaryFunctor<c10::Half, c10::Half, c10::Half, at::native::MulFunctor<float>>)"
0.260613,1.163238,244,4.767000,2.528000,7.265000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<float, float, float, at::native::MulFunctor<float>>, at::detail::Array<char*, int=3>, OffsetCalculator<int=2, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, float, float, at::native::MulFunctor<float>, at::native::BinaryFunctor<float, float, float, at::native::MulFunctor<float>>)"
0.238036,1.062464,544,1.953000,1.504000,2.560000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_84_GLOBAL__N__60_tmpxft_0000d3a1_00000000_11_Normalization_compute_75_cpp1_ii_5c38458722batch_norm_calc_invstdERKNS_6TensorES5_dENKUlvE_clEvENKUlvE4_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
0.222479,0.993027,204,4.867000,2.528000,10.688000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_19sigmoid_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
0.151708,0.677142,36,18.809000,18.720000,18.913000,"void at::cuda::detail::cub::DeviceRadixSortSingleTileKernel<at::cuda::detail::cub::DeviceRadixSortPolicy<float, at::cuda::detail::cub::NullType, int>::Policy700, bool=0, float, at::cuda::detail::cub::NullType, int>(int const *, at::cuda::detail::cub::DeviceRadixSortSingleTileKernel<at::cuda::detail::cub::DeviceRadixSortPolicy<float, at::cuda::detail::cub::NullType, int>::Policy700, bool=0, float, at::cuda::detail::cub::NullType, int>*, at::cuda::detail::cub::DeviceRadixSortPolicy<float, at::cuda::detail::cub::NullType, int>::Policy700 const *, at::cuda::detail::cub::DeviceRadixSortSingleTileKernel<at::cuda::detail::cub::DeviceRadixSortPolicy<float, at::cuda::detail::cub::NullType, int>::Policy700, bool=0, float, at::cuda::detail::cub::NullType, int>**, bool=0, int, int)"
0.150353,0.671096,297,2.259000,1.440000,7.072000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE6_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjLb0EESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_"
0.145578,0.649783,220,2.953000,2.016000,7.168000,"_ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_17index_kernel_implINS0_10OpaqueTypeILi4EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_"
0.134829,0.601806,68,8.850000,7.072000,11.776000,"void at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<c10::Half, unsigned int, int=3, int=128, int=1>(c10::Half*, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<c10::Half, unsigned int, int=3, int=128, int=1>, unsigned int, int=128, int=1>, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::TensorSizeStride<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata, unsigned int=4>, int, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata)"
0.128816,0.574965,106,5.424000,2.432000,6.496000,"void at::native::unrolled_elementwise_kernel<at::native::BUnaryFunctor<float, float, bool, at::native::CompareGTFunctor<float>>, at::detail::Array<char*, int=2>, OffsetCalculator<int=1, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, bool, float, at::native::CompareGTFunctor<float>, at::native::BUnaryFunctor<float, float, bool, at::native::CompareGTFunctor<float>>)"
0.118785,0.530192,67,7.913000,4.992000,10.080000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<c10::Half, c10::Half, c10::Half, at::native::MulFunctor<float>>, at::detail::Array<char*, int=2>>(int, c10::Half, c10::Half)"
0.104275,0.465428,192,2.424000,1.696000,6.272000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>, at::detail::Array<char*, int=3>, OffsetCalculator<int=2, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, float, float, at::native::AddFunctor<float>, at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>)"
0.101800,0.454382,212,2.143000,1.632000,4.736000,"void at::native::unrolled_elementwise_kernel<at::native::BUnaryFunctor<float, float, float, at::native::MulFunctor<float>>, at::detail::Array<char*, int=2>, OffsetCalculator<int=1, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, float, float, at::native::MulFunctor<float>, at::native::BUnaryFunctor<float, float, float, at::native::MulFunctor<float>>)"
0.101371,0.452465,209,2.164000,1.088000,21.824000,"[CUDA memcpy DtoD]"
0.093571,0.417650,107,3.903000,2.752000,4.833000,"void at::cuda::detail::cub::DeviceSelectSweepKernel<at::cuda::detail::cub::DispatchSelectIf<at::cuda::detail::cub::CountingInputIterator<long, long>, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, long*, int*, at::cuda::detail::cub::NullType, at::cuda::detail::cub::NullType, int, bool=0>::PtxSelectIfPolicyT, at::cuda::detail::cub::CountingInputIterator<long, long>, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, long*, int*, at::cuda::detail::cub::ScanTileState<int, bool=1>, at::cuda::detail::cub::NullType, at::cuda::detail::cub::NullType, int, bool=0>(long, at::cuda::detail::cub::CountingInputIterator<long, long>, bool, bool, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, int)"
0.093275,0.416330,36,11.564000,2.016000,24.097000,"void vision::ops::_GLOBAL__N__57_tmpxft_000001f1_00000000_10_nms_kernel_compute_75_cpp1_ii_143ee75b::nms_kernel_impl<float>(int, double, float const *, unsigned __int64*)"
0.086357,0.385450,36,10.706000,10.432000,10.976000,"void at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::write_indices<int>(long*, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::TensorDims<int>, int, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::TensorDims)"
0.082306,0.367372,4,91.843000,61.474000,178.278000,"maxwell_sgemm_128x128_nn"
0.079682,0.355659,204,1.743000,1.344000,12.256000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_80_GLOBAL__N__56_tmpxft_0000d3e6_00000000_11_PowKernel_compute_75_cpp1_ii_cd5e7a9229pow_tensor_scalar_kernel_implIN3c104HalfES5_EEvRNS_18TensorIteratorBaseET0_EUlS5_E_NS_6detail5ArrayIPcLi2EEEEEviS8_T1_"
0.079037,0.352781,204,1.729000,1.376000,2.240000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<c10::Half, c10::Half, c10::Half, at::native::AddFunctor<float>>, at::detail::Array<char*, int=2>>(int, c10::Half, c10::Half)"
0.078606,0.350857,43,8.159000,3.777000,19.840000,"sgemm_32x32x32_NN_vec"
0.066299,0.295922,59,5.015000,2.976000,6.528000,"_ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_11sum_functorIlllEclERNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_"
0.059206,0.264263,67,3.944000,3.456000,4.385000,"void at::cuda::detail::cub::DeviceReduceKernel<at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, int*, int, at::cuda::detail::cub::Sum>(int, int, at::cuda::detail::cub::Sum, at::cuda::detail::cub::GridEvenShare<int>, at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600)"
0.059005,0.263368,4,65.842000,39.809000,93.763000,"maxwell_sgemm_128x64_nn"
0.054145,0.241673,174,1.388000,1.152000,1.984000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<float, float, float, at::native::AddFunctor<float>>, at::detail::Array<char*, int=2>>(int, float, float)"
0.048932,0.218407,67,3.259000,3.168000,3.392000,"void at::cuda::detail::cub::DeviceReduceSingleTileKernel<at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, int*, int*, int, at::cuda::detail::cub::Sum, int>(int, int, at::cuda::detail::cub::Sum, at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, int*)"
0.047312,0.211177,23,9.181000,8.608000,9.729000,"void at::native::bitonicSortKVInPlace<float, long, int=-2, int=-1, at::native::GTOp<float, bool=1>, unsigned int, int=32>(at::cuda::detail::TensorInfo<float, bool=1>, float, float, float, at::cuda::detail<long, float>, float, float)"
0.041720,0.186216,143,1.302000,0.928000,10.656000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::FillFunctor<float>, at::detail::Array<char*, int=1>>(int, float, at::native::FillFunctor<float>)"
0.040931,0.182694,92,1.985000,1.696000,3.009000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_84_GLOBAL__N__60_tmpxft_0000da84_00000000_11_TensorCompare_compute_75_cpp1_ii_d0af11f724clamp_scalar_kernel_implERNS_18TensorIteratorBaseERKN3c106ScalarES8_ENKUlvE_clEvENKUlvE6_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjLb0EESH_NS0_6memory15LoadWithoutCastENSI_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_"
0.040780,0.182020,36,5.056000,4.928000,5.280000,"void at::cuda::detail::cub::DeviceSelectSweepKernel<at::cuda::detail::cub::DispatchSelectIf<float const *, at::cuda::detail::cub::NullType*, float*, long*, at::cuda::detail::cub::NullType, at::cuda::detail::cub::Equality, int, bool=0>::PtxSelectIfPolicyT, float const *, at::cuda::detail::cub::NullType*, float*, long*, at::cuda::detail::cub::ScanTileState<int, bool=1>, at::cuda::detail::cub::NullType, at::cuda::detail::cub::Equality, int, bool=0>(at::cuda::detail::cub::NullType*, float*, long*, at::cuda::detail::cub::NullType, at::cuda::detail::cub::Equality, int, bool=0, at::cuda::detail::cub::DispatchSelectIf<float const *, at::cuda::detail::cub::NullType*, float*, long*, at::cuda::detail::cub::NullType, at::cuda::detail::cub::Equality, int, bool=0>::PtxSelectIfPolicyT, int)"
0.040278,0.179778,72,2.496000,2.016000,3.136000,"_ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_21index_put_kernel_implINS0_10OpaqueTypeILi4EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_"
0.039225,0.175078,102,1.716000,1.472000,2.976000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BinaryFunctor<float, float, float, at::native::DivFunctor<float>>, at::detail::Array<char*, int=3>>(int, float, float)"
0.035397,0.157993,102,1.548000,1.376000,3.040000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16sqrt_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
0.034399,0.153540,51,3.010000,2.336000,6.529000,"void gemv2T_kernel_val<int, int, float, float, float, int=128, int=16, int=2, int=2, bool=0, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const , float, float)"
0.034192,0.152615,59,2.586000,1.728000,3.360000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE10_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_"
0.031173,0.139138,107,1.300000,0.992000,1.504000,"void at::cuda::detail::cub::DeviceCompactInitKernel<at::cuda::detail::cub::ScanTileState<int, bool=1>, int*>(int, int, bool=1)"
0.029789,0.132964,104,1.278000,1.120000,2.049000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>, at::detail::Array<char*, int=3>>(int, float, float)"
0.029151,0.130115,10,13.011000,12.736000,13.376000,"void at::native::bitonicSortKVInPlace<float, long, int=-2, int=-1, at::native::GTOp<float, bool=1>, unsigned int, int=128>(at::cuda::detail::TensorInfo<float, bool=1>, float, float, float, at::cuda::detail<long, float>, float, float)"
0.028406,0.126788,1,126.788000,126.788000,126.788000,"void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)"
0.027581,0.123107,40,3.077000,2.720000,4.096000,"void at::cuda::detail::cub::DeviceReduceSingleTileKernel<at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__54_tmpxft_0000d371_00000000_11_Nonzero_compute_75_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, int*, int, at::cuda::detail::cub::Sum, int>(int, int, at::cuda::detail::cub::Sum, at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, bool)"
0.024069,0.107429,46,2.335000,1.728000,3.105000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<float, float, bool, at::native::CompareEqFunctor<float>>, at::detail::Array<char*, int=3>, OffsetCalculator<int=2, unsigned int, bool=0>, OffsetCalculator<int=1, unsigned int, bool=0>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, bool, float, at::native::CompareEqFunctor<float>, at::native::BinaryFunctor<float, float, bool, at::native::CompareEqFunctor<float>>)"
0.024018,0.107203,36,2.977000,2.784000,3.136000,"_ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_17index_kernel_implINS0_10OpaqueTypeILi8EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_"
0.022368,0.099841,36,2.773000,2.496000,3.008000,"void at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<float, unsigned int, int=2, int=128, int=1>(float*, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrayBatchedCopy<float, unsigned int, int=2, int=128, int=1>, unsigned int, int=128, int=1>, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::TensorSizeStride<at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata, unsigned int=4>, int, at::native::_GLOBAL__N__52_tmpxft_0000d8a1_00000000_11_Shape_compute_75_cpp1_ii_cedd8df2::CatArrInputTensorMetadata)"
0.018498,0.082563,14,5.897000,3.072000,11.776000,"void at::native::_GLOBAL__N__55_tmpxft_0000d0ef_00000000_11_Indexing_compute_75_cpp1_ii_ccf56567::indexSelectSmallIndex<float, long, unsigned int, int=2, int=2, int=-2>(at::cuda::detail::TensorInfo<float, unsigned int>, unsigned int, at::cuda::detail<long, float>, int, int, float, long)"
0.017637,0.078722,51,1.543000,1.344000,2.080000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BinaryFunctor<float, float, float, at::native::MulFunctor<float>>, at::detail::Array<char*, int=3>>(int, float, float)"
0.016110,0.071907,51,1.409000,1.152000,2.016000,"void at::native::copy_to_diagonal_kernel<float>(float*, float, long, long, long, long)"
0.015049,0.067171,23,2.920000,2.720000,3.201000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_17round_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjLb0EESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_"
0.014038,0.062657,22,2.848000,2.752000,3.072000,"void at::native::_GLOBAL__N__55_tmpxft_0000d0ef_00000000_11_Indexing_compute_75_cpp1_ii_ccf56567::indexSelectLargeIndex<float, long, unsigned int, int=2, int=2, int=-2, bool=1>(at::cuda::detail::TensorInfo<float, unsigned int>, unsigned int, at::cuda::detail<long, float>, int, int, float, float, long)"
0.011263,0.050274,36,1.396000,1.344000,1.472000,"_ZN85_GLOBAL__N__61_tmpxft_0000d511_00000000_11_RangeFactories_compute_75_cpp1_ii_a1dd93ac29elementwise_kernel_with_indexIiZZZN2at6native15arange_cuda_outERKN3c106ScalarES6_S6_RNS1_6TensorEENKUlvE_clEvENKUlvE10_clEvEUllE_EEvT_T0_PN15function_traitsISD_E11result_typeE"
0.009808,0.043777,36,1.216000,1.184000,1.312000,"void at::cuda::detail::cub::DeviceCompactInitKernel<at::cuda::detail::cub::ScanTileState<int, bool=1>, long*>(int, int, bool=1)"
0.008854,0.039521,21,1.881000,1.792000,2.016000,"void at::native::unrolled_elementwise_kernel<at::native::BUnaryFunctor<long, long, long, at::native::AddFunctor<long>>, at::detail::Array<char*, int=2>, TrivialOffsetCalculator<int=1, unsigned int>, TrivialOffsetCalculator<int=1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, long, long, long, long, at::native::AddFunctor<long>, at::native::BUnaryFunctor<long, long, long, at::native::AddFunctor<long>>)"
0.008331,0.037185,1,37.185000,37.185000,37.185000,"void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)"
0.008159,0.036419,26,1.400000,1.120000,1.728000,"_ZN2at6native27unrolled_elementwise_kernelIZZZNS0_84_GLOBAL__N__60_tmpxft_0000da84_00000000_11_TensorCompare_compute_75_cpp1_ii_d0af11f724clamp_scalar_kernel_implERNS_18TensorIteratorBaseERKN3c106ScalarES8_ENKUlvE_clEvENKUlvE6_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESH_NS0_6memory15LoadWithoutCastENSI_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_"
0.005843,0.026081,13,2.006000,1.952000,2.208000,"_ZN2at6native29vectorized_elementwise_kernelILi2EZZZNS0_84_GLOBAL__N__60_tmpxft_0000da84_00000000_11_TensorCompare_compute_75_cpp1_ii_d0af11f724clamp_scalar_kernel_implERNS_18TensorIteratorBaseERKN3c106ScalarES8_ENKUlvE_clEvENKUlvE6_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
0.005700,0.025441,13,1.957000,1.888000,2.176000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_84_GLOBAL__N__60_tmpxft_0000da84_00000000_11_TensorCompare_compute_75_cpp1_ii_d0af11f724clamp_scalar_kernel_implERNS_18TensorIteratorBaseERKN3c106ScalarES8_ENKUlvE_clEvENKUlvE6_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
0.005470,0.024417,13,1.878000,1.824000,1.952000,"_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_17round_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_"
0.005370,0.023969,13,1.843000,1.760000,2.080000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<float, float, float, at::native::MulFunctor<float>>, at::detail::Array<char*, int=2>>(int, float, float)"
0.005342,0.023843,13,1.834000,1.792000,1.984000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<float, float, bool, at::native::CompareEqFunctor<float>>, at::detail::Array<char*, int=3>, TrivialOffsetCalculator<int=2, unsigned int>, TrivialOffsetCalculator<int=1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, bool, float, at::native::CompareEqFunctor<float>, at::native::BinaryFunctor<float, float, bool, at::native::CompareEqFunctor<float>>)"
0.004101,0.018305,8,2.288000,2.144000,2.368000,"void at::native::vectorized_elementwise_kernel<int=2, at::native::BUnaryFunctor<long, long, long, at::native::AddFunctor<long>>, at::detail::Array<char*, int=2>>(int, long, long)"
0.003771,0.016833,7,2.404000,2.368000,2.497000,"void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<long, long, long, at::native::AddFunctor<long>>, at::detail::Array<char*, int=2>>(int, long, long)"
0.001735,0.007744,5,1.548000,1.216000,1.728000,"void at::native::unrolled_elementwise_kernel<at::native::BUnaryFunctor<float, float, float, at::native::MulFunctor<float>>, at::detail::Array<char*, int=2>, TrivialOffsetCalculator<int=1, unsigned int>, TrivialOffsetCalculator<int=1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, float, float, at::native::MulFunctor<float>, at::native::BUnaryFunctor<float, float, float, at::native::MulFunctor<float>>)"
0.001319,0.005888,1,5.888000,5.888000,5.888000,"void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::MaxNanFunctor<float>>, unsigned int, float, int=4>>(float)"
0.000667,0.002976,2,1.488000,1.216000,1.760000,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>, at::detail::Array<char*, int=3>, TrivialOffsetCalculator<int=2, unsigned int>, TrivialOffsetCalculator<int=1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, float, float, at::native::AddFunctor<float>, at::native::BinaryFunctor<float, float, float, at::native::AddFunctor<float>>)"
0.000667,0.002976,2,1.488000,1.184000,1.792000,"void at::native::vectorized_elementwise_kernel<int=2, at::native::BUnaryFunctor<float, float, float, at::native::MulFunctor<float>>, at::detail::Array<char*, int=2>>(int, float, float)"
0.000636,0.002839,4,0.709000,0.428000,1.263000,"[CUDA memset]"
0.000380,0.001696,1,1.696000,1.696000,1.696000,"void at::native::unrolled_elementwise_kernel<at::native::BUnaryFunctor<float, float, bool, at::native::CompareGTFunctor<float>>, at::detail::Array<char*, int=2>, TrivialOffsetCalculator<int=1, unsigned int>, TrivialOffsetCalculator<int=1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, float, bool, float, at::native::CompareGTFunctor<float>, at::native::BUnaryFunctor<float, float, bool, at::native::CompareGTFunctor<float>>)"
==8604== System profiling result:
"","Device","Count","Avg","Min","Max"
"SM Clock (MHz)","TITAN Xp (0)",89,1442.000000,1442.000000,1442.000000
"Memory Clock (MHz)","TITAN Xp (0)",89,5514.539326,5508.000000,5702.000000
"Temperature (C)","TITAN Xp (0)",177,41.028249,40.000000,42.000000
"Power (mW)","TITAN Xp (0)",177,64383.378531,62296.000000,77415.000000
"Fan (%)","TITAN Xp (0)",89,23.000000,23.000000,23.000000
==8604== API calls:
"Time(%)","Time","Calls","Avg","Min","Max","Name"
%,s,,ms,ms,s,
86.251120,4.987966,24,207.831916,0.005020,4.981656,"cudaMalloc"
6.151723,0.355759,7,50.822669,0.000635,0.248001,"cudaFree"
4.786640,0.276815,26753,0.010347,0.005790,0.000685,"cudaLaunchKernel"
1.125411,0.065083,152165,0.000427,0.000260,0.000803,"cudaGetDevice"
0.801920,0.046376,2728,0.016999,0.008291,0.000400,"cudaMemcpyAsync"
0.250020,0.014459,2519,0.005739,0.001447,0.000114,"cudaStreamSynchronize"
0.216138,0.012499,4760,0.002625,0.001864,0.000611,"cudaEventRecord"
0.167096,0.009663,37671,0.000256,0.000105,0.000644,"cudaGetLastError"
0.078750,0.004554,378,0.012048,0.006122,0.000200,"cudaMemcpy"
0.038151,0.002206,286,0.007714,0.000128,0.001007,"cuDeviceGetAttribute"
0.036770,0.002126,3,0.708814,0.448601,0.001228,"cuDeviceTotalMem"
0.021226,0.001228,2,0.613764,0.601505,0.000626,"cudaGetDeviceProperties"
0.017935,0.001037,1,1.037171,1.037171,0.001037,"cudaHostAlloc"
0.013847,0.000801,134,0.005975,0.004876,0.000016,"cudaDeviceSynchronize"
0.010922,0.000632,188,0.003359,0.001488,0.000016,"cudaBindTexture"
0.006975,0.000403,3,0.134450,0.047901,0.000305,"cuDeviceGetName"
0.006309,0.000365,277,0.001317,0.000582,0.000005,"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"
0.005327,0.000308,603,0.000510,0.000276,0.000004,"cudaDeviceGetAttribute"
0.003598,0.000208,992,0.000209,0.000126,0.000005,"cudaPeekAtLastError"
0.003431,0.000198,188,0.001055,0.000618,0.000003,"cudaUnbindTexture"
0.003147,0.000182,4,0.045494,0.002310,0.000173,"cudaStreamCreateWithPriority"
0.000870,0.000050,8,0.006286,0.002068,0.000027,"cudaStreamCreateWithFlags"
0.000657,0.000038,48,0.000791,0.000438,0.000007,"cudaEventCreateWithFlags"
0.000535,0.000031,188,0.000164,0.000095,0.000000,"cudaCreateChannelDesc"
0.000510,0.000029,4,0.007368,0.003046,0.000015,"cudaMemsetAsync"
0.000203,0.000012,5,0.002351,0.000238,0.000008,"cuDeviceGetCount"
0.000162,0.000009,1,0.009340,0.009340,0.000009,"cuDeviceGetPCIBusId"
0.000157,0.000009,2,0.004540,0.001677,0.000007,"cudaSetDevice"
0.000133,0.000008,1,0.007665,0.007665,0.000008,"cudaFuncGetAttributes"
0.000108,0.000006,5,0.001253,0.000169,0.000005,"cudaGetDeviceCount"
0.000052,0.000003,5,0.000599,0.000234,0.000001,"cuDeviceGet"
0.000045,0.000003,1,0.002618,0.002618,0.000003,"cudaDeviceGetStreamPriorityRange"
0.000043,0.000002,2,0.001239,0.001131,0.000001,"cuInit"
0.000040,0.000002,1,0.002288,0.002288,0.000002,"cudaHostGetDevicePointer"
0.000017,0.000001,3,0.000336,0.000186,0.000001,""
0.000017,0.000001,2,0.000479,0.000429,0.000001,"cuDriverGetVersion"