Autotp training #6922
base: master
@@ -15,7 +15,8 @@
from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine
from deepspeed.utils.timer import SynchronizedWallClockTimer
from deepspeed.runtime.compiler import is_compile_supported

from deepspeed.utils import groups
from deepspeed.module_inject.layers import is_autotp_training_mode
from ..runtime.state_dict_factory import SDLoaderFactory
from ..runtime.weight_quantizer import WeightQuantization
from ..module_inject import replace_transformer_layer, generic_injection
@@ -247,6 +248,11 @@ def _post_forward_hook(self, module, input, output):
            self._model_times.append(elapsed_time)

    def _create_model_parallel_group(self, config):

        if is_autotp_training_mode():

[Review comment] Conceptually, control flow for training should not come here. I think some refactoring/restructuring is needed for code quality.

            groups._init_tp_mesh_device(config.tensor_parallel.tp_size)
            self.mp_group = groups.get_tensor_model_parallel_group()
            return
        # Call the init process
        if InferenceEngine.inference_mp_group is None:
            init_distributed()
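The review comment above argues that training control flow should not run through the inference engine. As a rough illustration of one possible restructuring (not part of this PR), the training-mode group setup could live in a small shared helper that both the engine and a training-side entry point call. The helper name `init_autotp_training_groups` below is hypothetical; `is_autotp_training_mode`, `groups._init_tp_mesh_device`, and `groups.get_tensor_model_parallel_group` are the calls visible in the hunk and are assumed to behave as they do there.

```python
# Sketch only: a hypothetical helper that keeps AutoTP-training group setup out of
# InferenceEngine, following the review comment. The groups/is_autotp_training_mode
# calls are taken from the diff above; the helper itself is not part of the PR.
from deepspeed.utils import groups
from deepspeed.module_inject.layers import is_autotp_training_mode


def init_autotp_training_groups(tp_size: int):
    """Initialize the tensor-parallel device mesh once and return the TP process group."""
    groups._init_tp_mesh_device(tp_size)
    return groups.get_tensor_model_parallel_group()


# With such a helper, the engine branch above would reduce to:
#     if is_autotp_training_mode():
#         self.mp_group = init_autotp_training_groups(config.tensor_parallel.tp_size)
#         return
```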
@@ -11,10 +11,12 @@
from typing import Optional
import torch
from deepspeed import comm as dist
from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce
from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce, Yuan_LinearALlreduce, Yuan_LinearLayer, GLM_LinearLayer, Conv_LinearALlreduce, fused_LinearLayer, conv_LinearLayer

[Review comment, truncated in source] Original coding style is

from deepspeed.accelerator import get_accelerator
from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw, shard_value_with_share_qk, shard_chunk_mlp
from .fusedqkv_utils import require_tp_fused_qkvw
from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list
from deepspeed.utils import groups
from deepspeed.module_inject.layers import is_autotp_training_mode


def move(tensor, device):
@@ -324,10 +326,18 @@ def tp_parser(model):
        return policy_list

    def set_tensor_parallel_config(self, mp_size, mp_group):

        if is_autotp_training_mode():
            self.mp_group = groups.get_tensor_model_parallel_group()
            self.mp_size = groups.get_tensor_model_parallel_world_size()
            return

        self.mp_size = mp_size
        self.mp_group = mp_group

    def _replace(self, child, name, conv_linear_layer):
        # This function should clearly define the routing rules for specific layers
        # and avoid any complex shard-related logic.
        if getattr(child, "replaced", False) == True:
            return
        weight_shape = child.weight.shape
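The `set_tensor_parallel_config` method added in the hunk above makes the source of the TP settings explicit: in AutoTP training mode the caller-supplied `mp_size`/`mp_group` are ignored and both values are read from `deepspeed.utils.groups`; otherwise the arguments are stored as before. A minimal sketch of that rule follows, written as a hypothetical free function (`resolve_tp_config` is not part of the PR; the `groups` and `is_autotp_training_mode` calls are the ones shown in the diff):

```python
# Minimal sketch, not PR code: mirrors what set_tensor_parallel_config stores on the
# replace policy in each mode. Assumes the TP mesh was already initialized
# (e.g. groups._init_tp_mesh_device(tp_size) as in the engine hunk).
from deepspeed.utils import groups
from deepspeed.module_inject.layers import is_autotp_training_mode


def resolve_tp_config(mp_size, mp_group):
    """Return the (mp_size, mp_group) pair the replace policy will actually use."""
    if is_autotp_training_mode():
        # AutoTP training: the globally initialized TP group is the single source
        # of truth; the arguments are ignored, exactly as in the diff above.
        return (groups.get_tensor_model_parallel_world_size(),
                groups.get_tensor_model_parallel_group())
    # Inference path: keep whatever the caller passed in.
    return mp_size, mp_group
```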
@@ -339,77 +349,37 @@ def _replace(self, child, name, conv_linear_layer):
        # For Yuan model
        if 'Yuan' in str(self.module):
            if 'v_proj' in name:
                weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(),
                                                         dist.get_world_size(), True)
                return LinearLayer(weight=weight, bias=bias)
                return Yuan_LinearLayer(child, self.mp_group)

            elif 'o_proj' in name:
                weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(),
                                                         dist.get_world_size(), False)
                return LinearAllreduce(weight, bias, self.mp_group)
        # For Arctic model, bypass to all_reduce replacement for w2 weights
                return Yuan_LinearALlreduce(child, self.mp_group)

        # For MLP including chunk layer.
        if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):

[Review comment] This additional code block is trying to deal with "MLP including chunk layer" (general case), but the returned module/object is named with a GLM prefix.

            return GLM_LinearLayer(child, self.mp_group)
        # For Arctic model, bypass to all_reduce replacement for w2 weights
        arctic_w2_all_reduce_linear = False
        if 'Arctic' in str(self.module) and 'w2' in name:
            arctic_w2_all_reduce_linear = True
        # For MLP including chunk layer.
        if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):
            weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size())
            return LinearLayer(weight=weight, bias=bias)
        if name in self.all_reduce_linears or arctic_w2_all_reduce_linear:
            # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
            # else [weight_shape[0], weight_shape[1] // mp_size]

            setattr(child, "replaced", True)
            if self.conv_linear_layer:
                child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
            data = child.weight.data.split(get_shard_size_list(
                weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size, name),
                                           dim=1)
            data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
            del data
                return Conv_LinearALlreduce(child, self.mp_group, name=name)
            elif name == "lm_head" or name == 'embed_out':
                return LmHeadLinearAllreduce(child, self.mp_group)

            setattr(child, "replaced", True)
            if name == "lm_head" or name == 'embed_out':
                return LmHeadLinearAllreduce(
                    torch.nn.parameter.Parameter(data_dc, requires_grad=False), dist.get_rank(), dist.get_world_size(),
                    child.bias if child.bias is None else torch.nn.parameter.Parameter(
                        move(child.bias,
                             get_accelerator().current_device_name())), self.mp_group)
            return LinearAllreduce(torch.nn.parameter.Parameter(data_dc, requires_grad=False), child.bias if child.bias is None else \
                torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name())), self.mp_group)
            return LinearAllreduce(child, self.mp_group, name=name)
        else:

            # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
            # else [weight_shape[0] // mp_size, weight_shape[1]]
            setattr(child, "replaced", True)
            if self.conv_linear_layer:
                child.weight.data = child.weight.data.transpose(-1, -2).contiguous()

            if require_tp_fused_qkvw(name, self.mp_size):
                conv_LinearLayer(child, self.mp_group)
            elif require_tp_fused_qkvw(name, self.mp_size):
                #Check and handle fused qkv for TP
                #The copy is a regular copy, The shape of dst and src is the same
                data_dc = move(
                    prepare_tp_fused_qkvw(self.module, child.weight.data, self.mp_size, mp_replace.gpu_index),
                    get_accelerator().current_device_name())

                bias_data_dc = None if child.bias is None else move(
                    prepare_tp_fused_qkvw(self.module, child.bias.data, self.mp_size, mp_replace.gpu_index),
                    get_accelerator().current_device_name())
            else:
                data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size, name),
                                               dim=1 if self.conv_linear_layer else 0)
                data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
                del data

                if child.bias is not None:
                    bias_data = child.bias.data.split(get_shard_size_list(
                        weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name),
                                                      dim=0)
                    bias_data = move(bias_data[mp_replace.gpu_index], get_accelerator().current_device_name())
                    bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False)
                    del bias_data
                else:
                    bias_data_dc = None
                return fused_LinearLayer(child, self.mp_group, fused_module=self.module)

            setattr(child, "replaced", True)
            return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), bias=bias_data_dc)
            return LinearLayer(child, self.mp_group, name=name)

    def _slice_embedding(self, child, name, conv_linear_layer):
        if getattr(child, "replaced", False) == True:
[Review comment] This logic should ideally be outside of the inference module. For example, in a deepspeed/runtime/tensor_parallel module?
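Taken together, the `_replace` rewrite turns the method into a routing table: the sharding, weight copies, and device moves that used to be inlined here are now encapsulated in the layer classes imported at the top of the file, each constructed from the original `child` module and the TP group. The condensed sketch below is reconstructed only from the lines shown in this diff; the attributes on `policy` and the omitted model-specific branches (the Arctic w2 flag, the conv path inside the else branch, the `replaced` bookkeeping) are assumptions, and the class names may not exist outside this PR's branch.

```python
# Condensed reconstruction of the new routing logic in _replace, based solely on the
# hunk above. `policy` stands in for the object that owns _replace (module, mp_group,
# mp_size, all_reduce_linears, conv_linear_layer are assumed attributes).
from deepspeed.module_inject.layers import (LinearAllreduce, LinearLayer, LmHeadLinearAllreduce,
                                            Yuan_LinearALlreduce, Yuan_LinearLayer, GLM_LinearLayer,
                                            Conv_LinearALlreduce, fused_LinearLayer)
from deepspeed.module_inject.fusedqkv_utils import require_tp_fused_qkvw


def route_layer(policy, child, name):
    # Every returned class is expected to shard `child` internally using policy.mp_group.
    if 'Yuan' in str(policy.module):
        if 'v_proj' in name:
            return Yuan_LinearLayer(child, policy.mp_group)
        elif 'o_proj' in name:
            return Yuan_LinearALlreduce(child, policy.mp_group)
    # Chunked-MLP case; the review comment notes the GLM-prefixed name is broader than GLM.
    if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(policy.module)):
        return GLM_LinearLayer(child, policy.mp_group)
    if name in policy.all_reduce_linears:
        if policy.conv_linear_layer:
            return Conv_LinearALlreduce(child, policy.mp_group, name=name)
        elif name in ("lm_head", "embed_out"):
            return LmHeadLinearAllreduce(child, policy.mp_group)
        return LinearAllreduce(child, policy.mp_group, name=name)
    if require_tp_fused_qkvw(name, policy.mp_size):
        return fused_LinearLayer(child, policy.mp_group, fused_module=policy.module)
    return LinearLayer(child, policy.mp_group, name=name)
```

This is also where the two open review questions land: whether the chunked-MLP class should carry a more general name than `GLM_LinearLayer`, and whether the routing layer as a whole belongs in a runtime tensor-parallel module rather than under the inference-oriented module_inject path.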