PaddlePaddle · Wanglongzhi2001 · Oct 29, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/llm/server/server/engine/infer.py b/llm/server/server/engine/infer.py
@@ -26,9 +26,10 @@
 import paddle.distributed as dist
 import paddle.distributed.fleet as fleet
 from paddlenlp.trl.llm_utils import get_rotary_position_embedding
-from paddlenlp_ops import step_paddle
+from paddlenlp_ops import step_paddle, speculate_step_paddle
 from server.data.processor import DataProcessor
 from server.engine.config import Config
+from server.engine.proposers import InferenceWithReferenceProposer
 from server.utils import get_logger
 from task_queue_manager import TaskQueueManager
 
@@ -46,6 +47,7 @@ def __init__(self, args):
 
         self.config = Config()
         self.model_cfg = self.config.get_model_config()
+        self.is_speculate_decoding = self.model_cfg.get("speculate_method") is not None
         self.format_print_configuration()
 
         self.args.num_layers = self.get_value(self.model_cfg, ["num_hidden_layers", "num_layers"])
@@ -62,6 +64,19 @@ def __init__(self, args):
         self.cache_kvs = {}
         self.init_inputs()
 
+        if self.is_speculate_decoding:
+            logger.info(f'Using speculating decoding, method: {self.model_cfg["speculate_method"]}.')
+            if self.model_cfg["speculate_method"] == "inference_with_reference":
+                self.proposer = InferenceWithReferenceProposer(
+                    self.model_cfg["speculate_max_draft_token_num"],
+                    self.model_cfg["speculate_max_ngram_size"],
+                    self.args.max_batch_size,
+                    self.args.max_seq_len)
+            else:
+                raise NotImplementedError(f'Not support {self.model_cfg["speculate_method"]}, only support inference_with_reference now.')
+        else:
+            self.proposer = None
+
         self.infer_queue = TaskQueueManager(rank=self.rank, mp_num=self.nranks, port=self.config.infer_port)
 
         model_rank_path = os.path.join(self.args.model_dir, f"rank_{self.rank}")
@@ -246,6 +261,19 @@ def init_inputs(self):
         self.share_inputs['free_list_len'] = paddle.full(
                             shape=[1], fill_value=self.free_list_len, dtype="int32")
 
+        # speculate decoding input
+        if self.is_speculate_decoding:
+            self.share_inputs["accept_tokens"] = paddle.full(
+                shape=[self.args.max_batch_size, self.model_cfg["speculate_max_draft_token_num"] + 1], fill_value=0, dtype="int64"
+            )
+            self.share_inputs["accept_num"] = paddle.full(shape=[self.args.max_batch_size], fill_value=0, dtype="int32")
+            self.share_inputs["draft_tokens"] = paddle.full(
+                shape=[self.args.max_batch_size, self.model_cfg["speculate_max_draft_token_num"] + 1], fill_value=0, dtype="int64"
+            )
+            self.share_inputs["actual_draft_token_num"] = paddle.full(
+                shape=[self.args.max_batch_size], fill_value=self.model_cfg["speculate_max_draft_token_num"], dtype="int32"
+            )
+
     def dy_input_preprocess(self, tasks):
         """
         dynamic insertion
@@ -288,23 +316,43 @@ def dy_input_preprocess(self, tasks):
             self.share_inputs["block_tables"][idx:idx + 1, :encoder_block_num] = np.array(
                                             task['block_tables'], dtype="int32")
 
+            if self.is_speculate_decoding:
+                self.share_inputs["draft_tokens"][idx:idx + 1] = np.zeros([self.model_cfg["speculate_max_draft_token_num"] + 1])
+                self.share_inputs["actual_draft_token_num"][idx:idx + 1] = np.array([self.model_cfg["speculate_max_draft_token_num"]])
+
     def step_cuda(self, seq_lens_this_time):
         """
         step cuda
         """
-        step_paddle(self.share_inputs['stop_flags'], seq_lens_this_time,
-                    self.share_inputs['step_seq_lens_encoder'],
-                    self.share_inputs['seq_lens_encoder'],
-                    self.share_inputs['seq_lens_decoder'], self.share_inputs["block_tables"],
-                    self.share_inputs['encoder_block_lens'],
-                    self.share_inputs["is_block_step"], self.share_inputs['step_block_list'],
-                    self.share_inputs['step_lens'], self.share_inputs['recover_block_list'],
-                    self.share_inputs['recover_lens'], self.share_inputs['need_block_list'],
-                    self.share_inputs['need_block_len'], self.share_inputs['used_list_len'],
-                    self.share_inputs['free_list'], self.share_inputs['free_list_len'],
-                    self.share_inputs['input_ids'], self.share_inputs['pre_ids'],
-                    self.share_inputs['step_idx'], self.share_inputs['next_tokens'],
-                    self.args.block_size, self.args.enc_dec_block_num, self.args.first_token_id)
+        # Dispatch to the step op that matches the decoding mode. Both ops are called
+        # with the same leading positional arguments (taken from self.share_inputs and
+        # self.args); the arguments are positional, so their order must be preserved.
+        if not self.is_speculate_decoding:
+            step_paddle(self.share_inputs['stop_flags'], seq_lens_this_time,
+                        self.share_inputs['step_seq_lens_encoder'],
+                        self.share_inputs['seq_lens_encoder'],
+                        self.share_inputs['seq_lens_decoder'], self.share_inputs["block_tables"],
+                        self.share_inputs['encoder_block_lens'],
+                        self.share_inputs["is_block_step"], self.share_inputs['step_block_list'],
+                        self.share_inputs['step_lens'], self.share_inputs['recover_block_list'],
+                        self.share_inputs['recover_lens'], self.share_inputs['need_block_list'],
+                        self.share_inputs['need_block_len'], self.share_inputs['used_list_len'],
+                        self.share_inputs['free_list'], self.share_inputs['free_list_len'],
+                        self.share_inputs['input_ids'], self.share_inputs['pre_ids'],
+                        self.share_inputs['step_idx'], self.share_inputs['next_tokens'],
+                        self.args.block_size, self.args.enc_dec_block_num, self.args.first_token_id)
+        else:
+            # Same argument list as step_paddle, with the configured
+            # speculate_max_draft_token_num appended as the final argument.
+            speculate_step_paddle(self.share_inputs['stop_flags'], seq_lens_this_time,
+                                  self.share_inputs['step_seq_lens_encoder'],
+                                  self.share_inputs['seq_lens_encoder'],
+                                  self.share_inputs['seq_lens_decoder'], self.share_inputs["block_tables"],
+                                  self.share_inputs['encoder_block_lens'],
+                                  self.share_inputs["is_block_step"], self.share_inputs['step_block_list'],
+                                  self.share_inputs['step_lens'], self.share_inputs['recover_block_list'],
+                                  self.share_inputs['recover_lens'], self.share_inputs['need_block_list'],
+                                  self.share_inputs['need_block_len'], self.share_inputs['used_list_len'],
+                                  self.share_inputs['free_list'], self.share_inputs['free_list_len'],
+                                  self.share_inputs['input_ids'], self.share_inputs['pre_ids'],
+                                  self.share_inputs['step_idx'], self.share_inputs['next_tokens'],
+                                  self.args.block_size, self.args.enc_dec_block_num, self.args.first_token_id,
+                                  self.model_cfg["speculate_max_draft_token_num"])
 
     def initialize_engine_ready_check_flag(self):
         """
@@ -429,6 +477,13 @@ def run(self):
                 time.sleep(0.001)
                 continue
 
+            if self.proposer is not None:
+                self.proposer.run(
+                    self.share_inputs,
+                    real_batch_size=self.args.max_batch_size,
+                    seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
+                )
+
             self.infer_engine.predictor.run()
             self.share_inputs['infer_seed'].add_(infer_seed_increment)
             self.share_inputs['infer_seed'][:] %= self.MAX_INFER_SEED

diff --git a/llm/server/server/engine/proposers.py b/llm/server/server/engine/proposers.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import paddle
+
+
+class Proposer(ABC):
+    """
+    Common interface for the draft-token proposers used by speculative decoding.
+
+    Concrete proposers subclass this and override ``run``; because ``run`` is
+    abstract, the base class itself cannot be instantiated.
+    """
+
+    def __init__(self, **kwargs):
+        # The base class keeps no state: keyword arguments are accepted and ignored.
+        return None
+
+    @abstractmethod
+    def run(self, model_inputs: dict[str, paddle.Tensor], **kargs):
+        """
+        Produce the draft tokens for the current step.
+
+        Args:
+        model_inputs (dict[str, paddle.Tensor]):
+            Named input tensors the proposer works on.
+        **kargs:
+            Extra, proposer-specific keyword arguments.
+        """
+        raise NotImplementedError
+
+
+class InferenceWithReferenceProposer(Proposer):
+    """
+    InferenceWithReference (https://arxiv.org/pdf/2304.04487) is one of the speculative decoding methods.
+    It matches tokens in the input and output and uses them as draft tokens.
+    """
+
+    def __init__(self, max_draft_token_num: int, max_ngram_size: int, max_batch_size: int, max_seq_len: int, **kwargs):
+        """
+        Args:
+        max_draft_token_num (int):
+            Maximum number of tokens a proposer can generate at one time.
+            The hyperparameter of k in the paper.
+        max_ngram_size (int):
+            The maximum size of the window used to match inputs and outputs.
+            The hyperparameter of n in the paper.
+        max_batch_size (int):
+            The maximum batch size.
+        max_seq_len (int):
+            The maximum sequence length.
+        **kwargs:
+            Accepted and ignored.
+        """
+        super().__init__()
+        self.max_ngram_size = max_ngram_size
+        # Host-side int64 buffers of shape [max_batch_size, 1] and
+        # [max_batch_size, max_seq_len]; both are handed to ngram_match on every run().
+        # NOTE(review): nothing in this class writes to these buffers after
+        # construction, so they stay all-zero unless a caller fills them - confirm
+        # where the prompt token ids and lengths are supposed to be stored.
+        self.input_ids_len = paddle.zeros(shape=[max_batch_size, 1], dtype="int64").cpu()
+        self.input_ids_cpu = paddle.zeros(shape=[max_batch_size, max_seq_len], dtype="int64").cpu()
+        self.max_batch_size = max_batch_size
+        self.max_draft_token_num = max_draft_token_num
+
+    def run(self, model_inputs: dict[str, paddle.Tensor], **kwargs):
+        """
+        Use ngram_match to get draft tokens from the input and output.
+
+        Args:
+        model_inputs (dict[str, paddle.Tensor]):
+            Shared input tensors. Reads "draft_tokens", "seq_lens_encoder",
+            "seq_lens_decoder", "pre_ids", "step_idx" and "actual_draft_token_num";
+            overwrites the contents of "draft_tokens" and "seq_lens_encoder".
+        **kwargs:
+            Must contain "seq_lens_this_time" (a tensor whose contents are
+            overwritten) and "real_batch_size" (forwarded to ngram_match).
+
+        Raises:
+        KeyError:
+            If a required keyword argument or model_inputs entry is missing.
+        """
+        # Look up the required keyword arguments once, before any device-to-host
+        # copy is made, so a missing argument fails immediately.
+        seq_lens_this_time_out = kwargs["seq_lens_this_time"]
+        real_batch_size = kwargs["real_batch_size"]
+
+        # ngram_match is given host copies of the tensors it works on.
+        draft_tokens = model_inputs["draft_tokens"].cpu()
+        seq_lens_this_time = seq_lens_this_time_out.cpu()
+        seq_lens_encoder = model_inputs["seq_lens_encoder"].cpu()
+        seq_lens_decoder = model_inputs["seq_lens_decoder"].cpu()
+
+        # Imported at call time rather than at module load (presumably so this module
+        # can be imported without the compiled custom ops - TODO confirm).
+        from paddlenlp_ops import ngram_match
+
+        ngram_match(
+            self.input_ids_cpu,
+            # Already moved to the host in __init__, so no further .cpu() is needed.
+            self.input_ids_len,
+            model_inputs["pre_ids"].cpu(),
+            model_inputs["step_idx"].cpu(),
+            model_inputs["actual_draft_token_num"].cpu(),
+            draft_tokens,
+            seq_lens_this_time,
+            seq_lens_encoder,
+            seq_lens_decoder,
+            real_batch_size,
+            self.max_ngram_size,
+            self.max_draft_token_num,
+        )
+
+        # The host copies are read back after the call, so ngram_match is presumably
+        # updating them in place (confirm against the op). Slice assignment keeps the
+        # existing tensor objects that the rest of the engine holds on to.
+        model_inputs["draft_tokens"][:] = draft_tokens.cuda()
+        model_inputs["seq_lens_encoder"][:] = seq_lens_encoder.cuda()
+        seq_lens_this_time_out[:] = seq_lens_this_time.cuda()