diff --git a/.gitignore b/.gitignore index bf707424..40e0026e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ save* *.pid *.ipynb* .venv/ -*.sh \ No newline at end of file +*.sh diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index eb08a4ec..d9122b81 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -44,4 +44,4 @@ save: save_lightllm_kv_calib: True lightllm_kv_cache_name: kv_cache_calib.json save_fake: False - save_path: /path/to/save/ \ No newline at end of file + save_path: /path/to/save/ diff --git a/llmc/__main__.py b/llmc/__main__.py index 47f9d6ad..e9ac91ba 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -18,9 +18,9 @@ from llmc.data import BaseDataset from llmc.eval.utils import eval_model, get_eval_list from llmc.models import * -from llmc.utils import (check_config, deploy_all_modality, get_modality, - mkdirs, print_important_package_version, seed_all, - collect_lightllm_kv_calib_json, +from llmc.utils import (check_config, collect_lightllm_kv_calib_json, + deploy_all_modality, get_modality, mkdirs, + print_important_package_version, seed_all, update_autoawq_quant_config, update_lightx2v_quant_config, update_vllm_quant_config) from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY @@ -290,4 +290,3 @@ def main(config): llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') logger.info('--- llmc finished ---') - diff --git a/llmc/compression/quantization/kvquant.py b/llmc/compression/quantization/kvquant.py index 29622297..b11a11b9 100644 --- a/llmc/compression/quantization/kvquant.py +++ b/llmc/compression/quantization/kvquant.py @@ -1,4 +1,5 @@ import copy + import torch from loguru import logger from transformers import DynamicCache @@ -13,7 +14,8 @@ class NaiveQuantKVCache(DynamicCache): def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1): super().__init__() - # Copy the config to avoid mutating the original quantization config in static KV calibration. + # Copy the config to avoid mutating the original quantization + # config in static KV calibration. kvquant_cfg = copy.deepcopy(kvquant_cfg) assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group', 'per_head'] self.num_hidden_layers, self.num_samples, self.bsz = ( diff --git a/llmc/models/mixtral.py b/llmc/models/mixtral.py index 0ca40671..aa0d5487 100644 --- a/llmc/models/mixtral.py +++ b/llmc/models/mixtral.py @@ -59,7 +59,8 @@ def get_subsets_in_block(self, block): return self._get_subsets_fused(block) def _get_subsets_legacy(self, block): - """transformers <5.0: block.block_sparse_moe with ModuleList experts.""" + """Transformers <5.0: block.block_sparse_moe with ModuleList + experts.""" moe = block.block_sparse_moe return [ { @@ -106,7 +107,7 @@ def _get_subsets_legacy(self, block): ] def _get_subsets_fused(self, block): - """transformers >=5.0: block.mlp with fused MixtralExperts.""" + """Transformers >=5.0: block.mlp with fused MixtralExperts.""" moe = block.mlp return [ {