Spacetime ver: the last memories
mypaper/AAAI2026_RoSA.bib
Executable file, 728 lines
@@ -0,0 +1,728 @@
% AAAING

% Datasets
% GSM8K
@article{cobbe2021training,
  title={Training verifiers to solve math word problems},
  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
  journal={arXiv preprint arXiv:2110.14168},
  year={2021}
}
% SVAMP
@article{patel2021nlp,
  title={Are NLP models really able to solve simple math word problems?},
  author={Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
  journal={arXiv preprint arXiv:2103.07191},
  year={2021}
}
% MultiArith
@article{roy2016solving,
  title={Solving general arithmetic word problems},
  author={Roy, Subhro and Roth, Dan},
  journal={arXiv preprint arXiv:1608.01413},
  year={2016}
}
% Addsub
@inproceedings{hosseini2014learning,
  title={Learning to solve arithmetic word problems with verb categorization},
  author={Hosseini, Mohammad Javad and Hajishirzi, Hannaneh and Etzioni, Oren and Kushman, Nate},
  booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
  pages={523--533},
  year={2014}
}
% AQuA
@article{ling2017program,
  title={Program induction by rationale generation: Learning to solve and explain algebraic word problems},
  author={Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
  journal={arXiv preprint arXiv:1705.04146},
  year={2017}
}
% SingleEq
@article{koncel2015parsing,
  title={Parsing algebraic word problems into equations},
  author={Koncel-Kedziorski, Rik and Hajishirzi, Hannaneh and Sabharwal, Ashish and Etzioni, Oren and Ang, Siena Dumas},
  journal={Transactions of the Association for Computational Linguistics},
  volume={3},
  pages={585--597},
  year={2015},
  publisher={MIT Press}
}
% MAWPS
@inproceedings{koncel2016mawps,
  title={MAWPS: A math word problem repository},
  author={Koncel-Kedziorski, Rik and Roy, Subhro and Amini, Aida and Kushman, Nate and Hajishirzi, Hannaneh},
  booktitle={Proceedings of the 2016 conference of the north american chapter of the association for computational linguistics: human language technologies},
  pages={1152--1157},
  year={2016}
}
% BoolQ
@article{clark2019boolq,
  title={Boolq: Exploring the surprising difficulty of natural yes/no questions},
  author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
  journal={arXiv preprint arXiv:1905.10044},
  year={2019}
}
% PIQA
@inproceedings{bisk2020piqa,
  title={Piqa: Reasoning about physical commonsense in natural language},
  author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
  booktitle={Proceedings of the AAAI conference on artificial intelligence},
  volume={34},
  number={05},
  pages={7432--7439},
  year={2020}
}
% SIQA
@article{sap2019socialiqa,
  title={Socialiqa: Commonsense reasoning about social interactions},
  author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and LeBras, Ronan and Choi, Yejin},
  journal={arXiv preprint arXiv:1904.09728},
  year={2019}
}
% HW
@article{zellers2019hellaswag,
  title={Hellaswag: Can a machine really finish your sentence?},
  author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
  journal={arXiv preprint arXiv:1905.07830},
  year={2019}
}
% WN
@inproceedings{sakaguchi2020winogrande,
  title={Winogrande: An adversarial winograd schema challenge at scale},
  author={Sakaguchi, Keisuke and Le Bras, Ronan and Bhagavatula, Chandra and Choi, Yejin},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={34},
  number={05},
  pages={8732--8740},
  year={2020}
}
% ARC
@article{clark2018think,
  title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
  author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
  journal={arXiv preprint arXiv:1803.05457},
  year={2018}
}
% OBDA
@article{mihaylov2018can,
  title={Can a suit of armor conduct electricity? a new dataset for open book question answering},
  author={Mihaylov, Todor and Clark, Peter and Khot, Tushar and Sabharwal, Ashish},
  journal={arXiv preprint arXiv:1809.02789},
  year={2018}
}

% Related
@inproceedings{houlsby2019parameter,
  title={Parameter-efficient transfer learning for NLP},
  author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
  booktitle={International conference on machine learning},
  pages={2790--2799},
  year={2019},
  organization={PMLR}
}
@article{li2021prefix,
  title={Prefix-tuning: Optimizing continuous prompts for generation},
  author={Li, Xiang Lisa and Liang, Percy},
  journal={arXiv preprint arXiv:2101.00190},
  year={2021}
}
@article{dong2025attention,
  title={Attention Retrieves, MLP Memorizes: Disentangling Trainable Components in the Transformer},
  author={Dong, Yihe and Noci, Lorenzo and Khodak, Mikhail and Li, Mufan},
  journal={arXiv preprint arXiv:2506.01115},
  year={2025}
}
@article{michel2019sixteen,
  title={Are sixteen heads really better than one?},
  author={Michel, Paul and Levy, Omer and Neubig, Graham},
  journal={Advances in neural information processing systems},
  volume={32},
  year={2019}
}
@article{belinkov2018evaluating,
  title={Evaluating layers of representation in neural machine translation on part-of-speech and semantic tagging tasks},
  author={Belinkov, Yonatan and M{\`a}rquez, Llu{\'\i}s and Sajjad, Hassan and Durrani, Nadir and Dalvi, Fahim and Glass, James},
  journal={arXiv preprint arXiv:1801.07772},
  year={2018}
}
% Others
@article{ding2023parameter,
  title={Parameter-efficient fine-tuning of large-scale pre-trained language models},
  author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
  journal={Nature machine intelligence},
  volume={5},
  number={3},
  pages={220--235},
  year={2023},
  publisher={Nature Publishing Group UK London}
}
@article{peng2023instruction,
  title={Instruction tuning with gpt-4},
  author={Peng, Baolin and Li, Chunyuan and He, Pengcheng and Galley, Michel and Gao, Jianfeng},
  journal={arXiv preprint arXiv:2304.03277},
  year={2023}
}

% Baselines
@article{liu2024dora,
  title={Dora: Weight-decomposed low-rank adaptation},
  author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
  journal={arXiv preprint arXiv:2402.09353},
  year={2024}
}
@article{hu2021lora,
  title={Lora: Low-rank adaptation of large language models},
  author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  journal={arXiv preprint arXiv:2106.09685},
  year={2021}
}
@article{zhang2023adalora,
  title={Adalora: Adaptive budget allocation for parameter-efficient fine-tuning},
  author={Zhang, Qingru and Chen, Minshuo and Bukharin, Alexander and Karampatziakis, Nikos and He, Pengcheng and Cheng, Yu and Chen, Weizhu and Zhao, Tuo},
  journal={arXiv preprint arXiv:2303.10512},
  year={2023}
}
% C3A
@article{chen2024parameter,
  title={Parameter-efficient fine-tuning via circular convolution},
  author={Chen, Aochuan and Cheng, Jiashun and Liu, Zijing and Gao, Ziqi and Tsung, Fugee and Li, Yu and Li, Jia},
  journal={arXiv preprint arXiv:2407.19342},
  year={2024}
}
% BONE
@article{kang2024balancing,
  title={Balancing LoRA Performance and Efficiency with Simple Shard Sharing},
  author={Kang, Jiale and Yin, Qingyu},
  journal={arXiv preprint arXiv:2409.15371},
  year={2024}
}
% VeRA
@article{kopiczko2023vera,
  title={Vera: Vector-based random matrix adaptation},
  author={Kopiczko, Dawid Jan and Blankevoort, Tijmen and Asano, Yuki M},
  journal={arXiv preprint arXiv:2310.11454},
  year={2023}
}
% BOFT
@article{liu2023parameter,
  title={Parameter-efficient orthogonal finetuning via butterfly factorization},
  author={Liu, Weiyang and Qiu, Zeju and Feng, Yao and Xiu, Yuliang and Xue, Yuxuan and Yu, Longhui and Feng, Haiwen and Liu, Zhen and Heo, Juyeon and Peng, Songyou and others},
  journal={arXiv preprint arXiv:2311.06243},
  year={2023}
}
% LN-Tuning
@article{zhao2023tuning,
  title={Tuning layernorm in attention: Towards efficient multi-modal llm finetuning},
  author={Zhao, Bingchen and Tu, Haoqin and Wei, Chen and Mei, Jieru and Xie, Cihang},
  journal={arXiv preprint arXiv:2312.11420},
  year={2023}
}

% Deepspeed
@inproceedings{rasley2020deepspeed,
  title={Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters},
  author={Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  booktitle={Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery \& data mining},
  pages={3505--3506},
  year={2020}
}
% Huggingface Transformers
@inproceedings{wolf2020transformers,
  title={Transformers: State-of-the-art natural language processing},
  author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and others},
  booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations},
  pages={38--45},
  year={2020}
}

@inproceedings{geva2021transformer,
  title={Transformer Feed-Forward Layers Are Key-Value Memories},
  author={Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer},
  booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  pages={5484--5495},
  year={2021}
}

@article{su2024roformer,
  title={Roformer: Enhanced transformer with rotary position embedding},
  author={Su, Jianlin and Ahmed, Murtadha and Lu, Yu and Pan, Shengfeng and Bo, Wen and Liu, Yunfeng},
  journal={Neurocomputing},
  volume={568},
  pages={127063},
  year={2024},
  publisher={Elsevier}
}
@article{barbero2024round,
  title={Round and round we go! what makes rotary positional encodings useful?},
  author={Barbero, Federico and Vitvitskyi, Alex and Perivolaropoulos, Christos and Pascanu, Razvan and Veli{\v{c}}kovi{\'c}, Petar},
  journal={arXiv preprint arXiv:2410.06205},
  year={2024}
}

@article{jin2025massive,
  title={Massive Values in Self-Attention Modules are the Key to Contextual Knowledge Understanding},
  author={Jin, Mingyu and Mei, Kai and Xu, Wujiang and Sun, Mingjie and Tang, Ruixiang and Du, Mengnan and Liu, Zirui and Zhang, Yongfeng},
  journal={arXiv preprint arXiv:2502.01563},
  year={2025}
}
@article{vaswani2017attention,
  title={Attention is all you need},
  author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  journal={Advances in neural information processing systems},
  volume={30},
  year={2017}
}
@article{touvron2023llama,
  title={Llama: Open and efficient foundation language models},
  author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  journal={arXiv preprint arXiv:2302.13971},
  year={2023}
}
@article{shazeer2020glu,
  title={Glu variants improve transformer},
  author={Shazeer, Noam},
  journal={arXiv preprint arXiv:2002.05202},
  year={2020}
}
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
@article{bai2023qwen,
  title={Qwen technical report},
  author={Bai, Jinze and Bai, Shuai and Chu, Yunfei and Cui, Zeyu and Dang, Kai and Deng, Xiaodong and Fan, Yang and Ge, Wenbin and Han, Yu and Huang, Fei and others},
  journal={arXiv preprint arXiv:2309.16609},
  year={2023}
}

% SiLU
@article{elfwing2018sigmoid,
  title={Sigmoid-weighted linear units for neural network function approximation in reinforcement learning},
  author={Elfwing, Stefan and Uchibe, Eiji and Doya, Kenji},
  journal={Neural networks},
  volume={107},
  pages={3--11},
  year={2018},
  publisher={Elsevier}
}
@article{ainslie2023gqa,
  title={Gqa: Training generalized multi-query transformer models from multi-head checkpoints},
  author={Ainslie, Joshua and Lee-Thorp, James and De Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
  journal={arXiv preprint arXiv:2305.13245},
  year={2023}
}
@article{voita2019bottom,
  title={The bottom-up evolution of representations in the transformer: A study with machine translation and language modeling objectives},
  author={Voita, Elena and Sennrich, Rico and Titov, Ivan},
  journal={arXiv preprint arXiv:1909.01380},
  year={2019}
}
@article{hu2023llm,
  title={Llm-adapters: An adapter family for parameter-efficient fine-tuning of large language models},
  author={Hu, Zhiqiang and Wang, Lei and Lan, Yihuai and Xu, Wanyu and Lim, Ee-Peng and Bing, Lidong and Xu, Xing and Poria, Soujanya and Lee, Roy Ka-Wei},
  journal={arXiv preprint arXiv:2304.01933},
  year={2023}
}
@article{team2024gemma,
  title={Gemma 2: Improving open language models at a practical size},
  author={Team, Gemma and Riviere, Morgane and Pathak, Shreya and Sessa, Pier Giuseppe and Hardin, Cassidy and Bhupatiraju, Surya and Hussenot, L{\'e}onard and Mesnard, Thomas and Shahriari, Bobak and Ram{\'e}, Alexandre and others},
  journal={arXiv preprint arXiv:2408.00118},
  year={2024}
}
@article{dubey2024llama,
  title={The llama 3 herd of models},
  author={Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
  journal={arXiv e-prints},
  pages={arXiv--2407},
  year={2024}
}
@article{team2024qwen2,
  title={Qwen2 technical report},
  author={Team, Qwen},
  journal={arXiv preprint arXiv:2407.10671},
  year={2024}
}
% Old

@article{sun2025stronger,
  title={A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
  author={Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
  journal={arXiv preprint arXiv:2502.15828},
  year={2025}
}
@article{pfeiffer2020mad,
  title={Mad-x: An adapter-based framework for multi-task cross-lingual transfer},
  author={Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
  journal={arXiv preprint arXiv:2005.00052},
  year={2020}
}
@article{raffel2020exploring,
  title={Exploring the limits of transfer learning with a unified text-to-text transformer},
  author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
  journal={Journal of machine learning research},
  volume={21},
  number={140},
  pages={1--67},
  year={2020}
}
@article{zaken2021bitfit,
  title={Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
  author={Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
  journal={arXiv preprint arXiv:2106.10199},
  year={2021}
}
@inproceedings{papineni2002bleu,
  title={Bleu: a method for automatic evaluation of machine translation},
  author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
  pages={311--318},
  year={2002}
}
@inproceedings{lin2004rouge,
  title={Rouge: A package for automatic evaluation of summaries},
  author={Lin, Chin-Yew},
  booktitle={Text summarization branches out},
  pages={74--81},
  year={2004}
}
@article{jang2016categorical,
  title={Categorical reparameterization with gumbel-softmax},
  author={Jang, Eric and Gu, Shixiang and Poole, Ben},
  journal={arXiv preprint arXiv:1611.01144},
  year={2016}
}
@inproceedings{he2015delving,
  title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE international conference on computer vision},
  pages={1026--1034},
  year={2015}
}
@article{guo2025nlora,
  title={NLoRA: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
  author={Guo, Chenlu and Wu, Yuan and Chang, Yi},
  journal={arXiv preprint arXiv:2502.14482},
  year={2025}
}

@article{ba2016layer,
  title={Layer normalization},
  author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  journal={arXiv preprint arXiv:1607.06450},
  year={2016}
}

@article{team2023gemini,
  title={Gemini: a family of highly capable multimodal models},
  author={Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
  journal={arXiv preprint arXiv:2312.11805},
  year={2023}
}
@article{liu2023moelora,
  title={Moelora: An moe-based parameter efficient fine-tuning method for multi-task medical applications},
  author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
  journal={arXiv preprint arXiv:2310.18339},
  year={2023}
}
@article{wang2023multilora,
  title={Multilora: Democratizing lora for better multi-task learning},
  author={Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
  journal={arXiv preprint arXiv:2311.11501},
  year={2023}
}
@article{liu2021p,
  title={P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
  author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
  journal={arXiv preprint arXiv:2110.07602},
  year={2021}
}
@article{brown2020language,
  title={Language models are few-shot learners},
  author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  journal={Advances in neural information processing systems},
  volume={33},
  pages={1877--1901},
  year={2020}
}
@article{liu2021conflict,
  title={Conflict-averse gradient descent for multi-task learning},
  author={Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={18878--18890},
  year={2021}
}
@article{navon2022multi,
  title={Multi-task learning as a bargaining game},
  author={Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
  journal={arXiv preprint arXiv:2202.01017},
  year={2022}
}
@article{yu2020gradient,
  title={Gradient surgery for multi-task learning},
  author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
  journal={Advances in Neural Information Processing Systems},
  volume={33},
  pages={5824--5836},
  year={2020}
}
@article{renduchintala2023tied,
  title={Tied-lora: Enhacing parameter efficiency of lora with weight tying},
  author={Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
  journal={arXiv preprint arXiv:2311.09578},
  year={2023}
}
@inproceedings{kwon2023efficient,
  title={Efficient memory management for large language model serving with pagedattention},
  author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
  booktitle={Proceedings of the 29th Symposium on Operating Systems Principles},
  pages={611--626},
  year={2023}
}

@article{dai2024deepseekmoe,
  title={Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models},
  author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
  journal={arXiv preprint arXiv:2401.06066},
  year={2024}
}
@article{guo2025deepseek,
  title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
  author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
  journal={arXiv preprint arXiv:2501.12948},
  year={2025}
}
@article{shazeer2017outrageously,
  title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
  author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
  journal={arXiv preprint arXiv:1701.06538},
  year={2017}
}
@inproceedings{rajbhandari2022deepspeed,
  title={Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale},
  author={Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
  booktitle={International conference on machine learning},
  pages={18332--18346},
  year={2022},
  organization={PMLR}
}
@article{zhang2023instruction,
  title={Instruction tuning for large language models: A survey},
  author={Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
  journal={arXiv preprint arXiv:2308.10792},
  year={2023}
}
@article{han2024parameter,
  title={Parameter-efficient fine-tuning for large models: A comprehensive survey},
  author={Han, Zeyu and Gao, Chao and Liu, Jinyang and Zhang, Jeff and Zhang, Sai Qian},
  journal={arXiv preprint arXiv:2403.14608},
  year={2024}
}
@article{pfeiffer2020adapterfusion,
  title={Adapterfusion: Non-destructive task composition for transfer learning},
  author={Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
  journal={arXiv preprint arXiv:2005.00247},
  year={2020}
}
@article{pfeiffer2020adapterhub,
  title={Adapterhub: A framework for adapting transformers},
  author={Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
  journal={arXiv preprint arXiv:2007.07779},
  year={2020}
}
@article{lialin2023scaling,
  title={Scaling down to scale up: A guide to parameter-efficient fine-tuning},
  author={Lialin, Vladislav and Deshpande, Vijeta and Rumshisky, Anna},
  journal={arXiv preprint arXiv:2303.15647},
  year={2023}
}

@article{lu2023uniadapter,
  title={Uniadapter: Unified parameter-efficient transfer learning for cross-modal modeling},
  author={Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
  journal={arXiv preprint arXiv:2302.06605},
  year={2023}
}

@article{fedus2022switch,
  title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
  author={Fedus, William and Zoph, Barret and Shazeer, Noam},
  journal={Journal of Machine Learning Research},
  volume={23},
  number={120},
  pages={1--39},
  year={2022}
}
@article{lepikhin2020gshard,
  title={Gshard: Scaling giant models with conditional computation and automatic sharding},
  author={Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
  journal={arXiv preprint arXiv:2006.16668},
  year={2020}
}
@article{luo2024moelora,
  title={Moelora: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
  author={Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
  journal={arXiv preprint arXiv:2402.12851},
  year={2024}
}
@article{guo2024large,
  title={Large language model based multi-agents: A survey of progress and challenges},
  author={Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
  journal={arXiv preprint arXiv:2402.01680},
  year={2024}
}
@article{zhao2023survey,
  title={A survey of large language models},
  author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
  journal={arXiv preprint arXiv:2303.18223},
  year={2023}
}
@article{gao2024higher,
  title={Higher layers need more lora experts},
  author={Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
  journal={arXiv preprint arXiv:2402.08562},
  year={2024}
}
@inproceedings{dou2024loramoe,
  title={LoRAMoE: Alleviating world knowledge forgetting in large language models via MoE-style plugin},
  author={Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
  booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={1932--1945},
  year={2024}
}

@article{achiam2023gpt,
  title={Gpt-4 technical report},
  author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
  journal={arXiv preprint arXiv:2303.08774},
  year={2023}
}
@article{jaszczur2021sparse,
  title={Sparse is enough in scaling transformers},
  author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={9895--9907},
  year={2021}
}
@inproceedings{standley2020tasks,
  title={Which tasks should be learned together in multi-task learning?},
  author={Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
  booktitle={International conference on machine learning},
  pages={9120--9132},
  year={2020},
  organization={PMLR}
}
@article{cai2024survey,
  title={A survey on mixture of experts},
  author={Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
  journal={arXiv preprint arXiv:2407.06204},
  year={2024}
}
@article{karimi2021compacter,
  title={Compacter: Efficient low-rank hypercomplex adapter layers},
  author={Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={1022--1035},
  year={2021}
}
@article{bommasani2021opportunities,
  title={On the opportunities and risks of foundation models},
  author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
  journal={arXiv preprint arXiv:2108.07258},
  year={2021}
}
@article{pan2024lisa,
  title={LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
  author={Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
  journal={arXiv preprint arXiv:2403.17919},
  year={2024}
}
@article{feng2024mixture,
  title={Mixture-of-loras: An efficient multitask tuning for large language models},
  author={Feng, Wenfeng and Hao, Chuzhan and Zhang, Yuewei and Han, Yu and Wang, Hao},
  journal={arXiv preprint arXiv:2403.03432},
  year={2024}
}
@article{lester2021power,
  title={The power of scale for parameter-efficient prompt tuning},
  author={Lester, Brian and Al-Rfou, Rami and Constant, Noah},
  journal={arXiv preprint arXiv:2104.08691},
  year={2021}
}
@article{zhou2024lima,
  title={Lima: Less is more for alignment},
  author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  year={2024}
}
@article{wei2021finetuned,
  title={Finetuned language models are zero-shot learners},
  author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
  journal={arXiv preprint arXiv:2109.01652},
  year={2021}
}

@article{brynjolfsson2025generative,
  title={Generative AI at work},
  author={Brynjolfsson, Erik and Li, Danielle and Raymond, Lindsey},
  journal={The Quarterly Journal of Economics},
  pages={qjae044},
  year={2025},
  publisher={Oxford University Press}
}
@Misc{peft,
  title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
  author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
  howpublished = {\url{https://github.com/huggingface/peft}},
  year = {2022}
}
@article{li2023chatdoctor,
  title={ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge},
  author={Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
  journal={Cureus},
  volume={15},
  number={6},
  year={2023},
  publisher={Cureus}
}
@online{DatabricksBlog2023DollyV2,
  author = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},
  title = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},
  year = {2023},
  url = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
  urldate = {2023-06-30}
}
@inproceedings{nakano2021webgpt,
  author = {Reiichiro Nakano and Jacob Hilton and Suchir Balaji and Jeff Wu and Long Ouyang and Christina Kim and Christopher Hesse and Shantanu Jain and Vineet Kosaraju and William Saunders and Xu Jiang and Karl Cobbe and Tyna Eloundou and Gretchen Krueger and Kevin Button and Matthew Knight and Benjamin Chess and John Schulman},
  title = {WebGPT: Browser-assisted question-answering with human feedback},
  booktitle = {arXiv},
  year = {2021}
}
@inproceedings{zhang2023automatic,
  title={Automatic Chain of Thought Prompting in Large Language Models},
  author={Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
  booktitle={The Eleventh International Conference on Learning Representations (ICLR 2023)},
  year={2023}
}
@misc{codealpaca,
  author = {Sahil Chaudhary},
  title = {Code Alpaca: An Instruction-following LLaMA model for code generation},
  year = {2023},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/sahil280114/codealpaca}}
}
@article{zhao2024hypermoe,
  title={HyperMoE: Towards Better Mixture of Experts via Transferring Among Experts},
  author={Zhao, Hao and Qiu, Zihan and Wu, Huijia and Wang, Zili and He, Zhaofeng and Fu, Jie},
  journal={arXiv preprint arXiv:2402.12656},
  year={2024}
}
mypaper/AAAI2026_RoSA.tex
Executable file, 690 lines
@@ -0,0 +1,690 @@
\title{RoSA: Enhancing Parameter-Efficient Fine-Tuning via \\ RoPE-aware Selective Adaptation in Large Language Models}

\input{0_misc}

\begin{abstract}
Fine-tuning large language models is essential for task-specific adaptation, yet it remains computationally prohibitive. Parameter-Efficient Fine-Tuning (PEFT) methods have emerged as a solution, but current approaches typically ignore the distinct roles of model components and the heterogeneous importance across layers, thereby limiting adaptation efficiency.
Motivated by the observation that Rotary Position Embeddings (RoPE) induce critical activations in the low-frequency dimensions of attention states, we propose RoPE-aware Selective Adaptation (RoSA), a novel PEFT framework that allocates trainable parameters in a more targeted and effective manner.
RoSA comprises a RoPE-aware Attention Enhancement (RoAE) module, which selectively enhances the low-frequency components of RoPE-influenced attention states, and a Dynamic Layer Selection (DLS) strategy that adaptively identifies and updates the most critical layers based on LayerNorm gradient norms.
By combining dimension-wise enhancement with layer-wise adaptation, RoSA achieves more targeted and efficient fine-tuning.
Extensive experiments on fifteen commonsense and arithmetic benchmarks demonstrate that RoSA outperforms existing mainstream PEFT methods under comparable trainable parameter budgets. The code is available to ease reproducibility\footnote{\codelink}.
\end{abstract}


\section{Introduction} \label{sec:intro}
Large Language Models (LLMs) have achieved remarkable success across a wide range of natural language processing (NLP) tasks, becoming a foundational infrastructure in numerous real-world applications. %TODO
However, deploying these large-scale models often requires fine-tuning to align them with specific task requirements~\cite{peng2023instruction}.
Traditional fine-tuning methods, such as full-parameter fine-tuning, are extremely resource-intensive, severely constraining their broader applicability.
Consequently, exploring Parameter-Efficient Fine-Tuning~(PEFT) methods, which aim to substantially reduce fine-tuning costs without compromising model performance, has emerged as a key research focus in the LLM community~\cite{ding2023parameter}.

Recent PEFT methods typically adapt LLMs to specific downstream tasks by fine-tuning only a small fraction of parameters, significantly reducing computational cost compared to traditional full-parameter fine-tuning. For example, mainstream PEFT methods such as P-tuning~\cite{liu2021p}, LoRA~\cite{hu2021lora}, DoRA~\cite{liu2024dora}, and C3A~\cite{chen2024parameter}
introduce lightweight trainable adaptation modules into the pre-trained model while keeping most of the original model parameters frozen.

Despite these advancements, existing PEFT methods exhibit two critical limitations:
\textbf{(1) Component-Heterogeneity Neglect:} Current methods largely neglect the intrinsic functional roles of LLM components~\cite{zhang2023adalora}. For instance, LoRA inserts low-rank matrices into the linear layers of attention and feed-forward blocks, enabling adaptation with minimal trainable parameters. However, such designs are applied uniformly across modules without analyzing their distinct functional roles.
\textbf{(2) Layer-Heterogeneity Neglect:} Existing approaches often overlook the diversity across layers.
However, LLMs capture syntax in lower layers and semantics in higher layers~\cite{voita2019bottom}.
Most PEFT methods apply uniform adaptation schemes across all layers, limiting the potential efficiency and effectiveness of parameter allocation.

\begin{figure}[t]
\centering
\begin{subfigure}[b]{0.47\linewidth}
\hspace{-3px}
\includegraphics[width=\linewidth]{assets/Layer10.pdf}
\caption{Across Head Dimensions}
\label{fig:attnindim}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.48\linewidth}
\hspace{-3px}
\includegraphics[width=\linewidth]{assets/AcrossLayer.pdf}
\caption{Across Layers}
\label{fig:attninlayer}
\end{subfigure}
\caption{Q-state activation strength visualizations in LLaMA-2-7B.
We compute the average L2 norm per attention head to quantify activation strength.
Stronger activations are concentrated in high-indexed (\ie low-RoPE-frequency) dimensions and vary across layers, highlighting both dimension-wise and layer-wise heterogeneity.
}
\label{fig:hotattn}
\end{figure}

Our approach is motivated by a key observation regarding LLM architectures: different components exhibit distinct roles and activation behaviors.
Recent studies suggest that Feed-Forward Networks (FFN) act as repositories for storing factual knowledge, while Multi-Head Attention (MHA) modules function primarily for knowledge retrieval and contextual routing~\cite{geva2021transformer}.
A key component within the MHA module is the Rotary Position Embedding (RoPE)~\cite{su2024roformer}, which plays a critical role in contextual understanding by encoding positional information into the attention mechanism. RoPE achieves this by applying pair-wise complex rotations to the Query (Q) and Key (K) state tensors, with the rotation frequency decreasing geometrically across successive dimension pairs.

This frequency-based encoding introduces unique activation patterns.
As shown in Fig.\ref{fig:hotattn}(\subref{fig:attnindim}), there are clear differences in Q-state activations across dimensional channels.
Specifically, low-frequency components (corresponding to higher-indexed dimensions within each half of the attention states) exhibit denser and more intense activations, while high-frequency components show sparser activations.
Analyses confirm that these prominent low-frequency activations are crucial for contextual understanding~\cite{barbero2024round, jin2025massive}.
Furthermore, Fig.\ref{fig:hotattn}(\subref{fig:attninlayer}) reveals that this activation intensity is also highly heterogeneous across layers, suggesting that different layers contribute unequally.
These findings suggest that targeting the critical low-frequency components, and accounting for the varying importance across layers, holds significant potential for enhancing both model performance and parameter efficiency during fine-tuning.

Building on this, we propose a novel parameter-efficient fine-tuning method called RoPE-aware Selective Adaptation (RoSA). Specifically, RoSA integrates two complementary modules:
(1) \textit{a RoPE-aware Attention Enhancement (RoAE)} module, explicitly designed to adaptively enhance the distinctive low-frequency components within query/key states influenced by the RoPE mechanism, thereby strengthening the model's contextual understanding capabilities with high parameter efficiency.
(2) a \textit{Dynamic Layer Selection~(DLS)} strategy, enabling RoSA to dynamically identify and adapt only the most critical layers during fine-tuning. Specifically, layer importance is quantified by computing the gradient norm of Layer Normalization parameters, serving as a reliable proxy for each layer's contribution to model performance.
By simultaneously leveraging RoPE's inherent structural characteristics and dynamically allocating fine-tuning resources to the layers that matter most, RoSA substantially improves parameter efficiency and model effectiveness compared to existing PEFT techniques. The main contributions of this paper are summarized as follows:
\begin{itemize}[leftmargin=*, topsep=0pt]
\item To our knowledge, we are the first PEFT work to explicitly consider the distinctive low-frequency attention components induced by RoPE. We propose RoAE, a RoPE-aware PEFT module that performs targeted enhancement of these functionally key dimensions, effectively strengthening contextual understanding capabilities in a highly parameter-efficient manner.
\item We introduce RoSA, a comprehensive PEFT framework that combines the RoAE module with a Dynamic Layer Selection (DLS) strategy. Specifically, DLS adaptively identifies and selectively updates the most impactful layers based on the gradient norms of Layer Normalization parameters. Thus, RoSA allocates parameters both dimension-wise and layer-wise according to their functional importance, enhancing overall efficiency.
\item Extensive experiments on fifteen public benchmark datasets, using three backbone models and covering commonsense and arithmetic QA tasks, demonstrate that RoSA significantly outperforms existing mainstream PEFT methods under comparable trainable parameter scales, validating both its efficiency and effectiveness.
\end{itemize}

\begin{figure*}[ht]
\centering
\includegraphics[width=0.81\linewidth]{assets/rosa_arch_deeper.pdf}
\caption{The architecture of RoSA. RoSA consists of two key modules: RoPE-aware Attention Enhancement (RoAE), which selectively enhances low-frequency components of RoPE-influenced Q/K states, and
Dynamic Layer Selection (DLS), which dynamically selects important layers for update. % based on LayerNorm gradient signals.
Together, they enable targeted, efficient adaptation both frequency-wise and layer-wise.}
\label{fig:framework}
\end{figure*}

\section{Preliminaries} \label{sec:pre}
This section reviews the key architectural components of LLMs and the RoPE mechanism, which are essential for understanding the motivations behind our method.
\subsection{LLM Architecture}
Modern LLMs, such as the LLaMA family~\cite{touvron2023llama}, are primarily built upon the decoder-only Transformer architecture~\cite{vaswani2017attention}. This architecture consists of a stack of identical Transformer blocks, each containing two primary components: a Multi-Head Self-Attention (MHSA) module and a Feed-Forward Network (FFN) module.
The MHSA module allows the model to weigh the importance of different tokens in the input sequence, capturing complex contextual relationships.
To incorporate crucial information about token order, which self-attention itself lacks, these models integrate positional encodings. Specifically, modern LLMs heavily adopt the Rotary Position Embedding (RoPE)~\cite{su2024roformer} as a relative positional encoding mechanism, which directly injects relative positional information into the attention computation and plays a crucial role in the model's ability to generalize over long contexts.
The FFN, typically composed of two linear layers with a non-linear activation function, is responsible for feature transformation and is believed to be a key repository of factual and commonsense knowledge stored within the model's parameters~\cite{geva2021transformer}.
A residual connection~\cite{he2016deep} is applied around each of the two sub-modules, combined with a Layer Normalization step.
Most LLMs utilize Pre-LN for enhanced training stability, where normalization is applied directly to the input of each sub-module. In this design, LayerNorm acts as a bridge between the residual stream and the subsequent attention or FFN modules, modulating the information flow across modules and layers.
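For concreteness, a minimal PyTorch-style sketch of such a Pre-LN block is shown below. It is an illustrative simplification under common assumptions rather than the exact architecture of any specific model (LLaMA-style blocks, for instance, use RMSNorm and a gated FFN):
\begin{verbatim}
import torch.nn as nn

class PreLNBlock(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attn_norm = nn.LayerNorm(d_model)  # pre-attention normalization
        self.ffn_norm = nn.LayerNorm(d_model)   # pre-FFN normalization
        self.attn = nn.MultiheadAttention(d_model, n_heads,
                                          batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model), nn.SiLU(),
            nn.Linear(4 * d_model, d_model))

    def forward(self, x):
        # Pre-LN: normalize the sub-module input and wrap each
        # sub-module in a residual connection.
        h = self.attn_norm(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        x = x + self.ffn(self.ffn_norm(x))
        return x
\end{verbatim}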
\subsection{Rotary Position Embedding (RoPE)}
As mentioned in the previous section, the original self-attention mechanism is inherently permutation-invariant, meaning that the order of input tokens does not affect the output.
Therefore, an external mechanism is required to encode token positions.
While early models use additive, learned absolute position embeddings, modern LLMs widely adopt Rotary Position Embedding (RoPE)~\cite{su2024roformer} due to its effectiveness and efficiency in encoding relative positional information, especially for long sequences.

RoPE injects positional information by applying a rotational transformation directly to the Query ($q$) and Key ($k$) vectors in each attention head.
Specifically, given a vector $\mathbf{z} \in \mathbb{R}^d$, where $d$ is even, RoPE splits it into two halves: a \textit{real} part $\mathbf{z}^{\text{real}}$ and an \textit{imaginary} part $\mathbf{z}^{\text{imag}}$, each of dimension $d/2$. Then, for each index $i$, RoPE treats $(\mathbf{z}^{\text{real}}_i, \mathbf{z}^{\text{imag}}_i)$ as a complex-valued component and applies a 2D rotation:

\begin{equation}
\text{RoPE}(\mathbf{z}^{\text{real}}_i, \mathbf{z}^{\text{imag}}_i) =
\begin{bmatrix}
\cos \theta_i & -\sin \theta_i \\
\sin \theta_i & \cos \theta_i
\end{bmatrix}
\begin{bmatrix}
\mathbf{z}^{\text{real}}_i \\
\mathbf{z}^{\text{imag}}_i
\end{bmatrix},
\end{equation}
where $\theta_i = t \cdot \omega^{-2i/d}$, $t$ is the token position index, and $\omega$ is a base frequency constant (commonly set to $10{,}000$). This operation is equivalent to applying a complex-valued sinusoidal rotation, enabling relative positional relationships to be encoded directly into the attention mechanism. Since each rotation is applied to the corresponding dimensions in the two halves of the vector, both halves share the same rotation frequency $\theta_i$. As observed in Fig.\ref{fig:hotattn}(\subref{fig:attnindim}), the activation patterns of the two halves exhibit similar structure, highlighting the impact of RoPE on the attention mechanism across dimensions.
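For illustration, a minimal PyTorch-style sketch of this rotation for a single head vector is given below, following the split-half convention described above (the function and variable names are our own, not those of any released implementation):
\begin{verbatim}
import torch

def rope_rotate(z, t, omega=10000.0):
    # z: [d_h] head vector, t: token position index
    d_h = z.shape[-1]
    i = torch.arange(d_h // 2)                  # pair index
    theta = t * omega ** (-2.0 * i / d_h)       # frequency decays with i
    z_real, z_imag = z[: d_h // 2], z[d_h // 2:]
    # rotate each (real, imag) pair by the angle theta_i
    rot_real = z_real * torch.cos(theta) - z_imag * torch.sin(theta)
    rot_imag = z_real * torch.sin(theta) + z_imag * torch.cos(theta)
    return torch.cat([rot_real, rot_imag], dim=-1)
\end{verbatim}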

As $\theta_i$ decreases geometrically with the index $i$, low-indexed dimensions encode high-frequency positional patterns, while high-indexed dimensions encode low-frequency, smoother components.
These low-frequency components often produce stronger and denser activations, and are crucial for long-range dependency modeling.
These observations suggest that the frequency structure induced by RoPE provides a meaningful basis for improving PEFT methods.
In this work, we explicitly target the low-frequency components of RoPE-influenced attention states, aiming to enhance parameter efficiency in a more targeted manner.


\section{Method}

In this section, we first provide an overview of the RoSA framework, then describe its two core components in detail, and finally present the overall algorithm.

\subsection{Framework Overview}
Existing PEFT methods often overlook two key aspects of LLMs: (\textit{i}) the frequency-specific structure introduced by RoPE, and (\textit{ii}) the layer-wise importance heterogeneity during adaptation.
This motivates us to design a more targeted and adaptive fine-tuning strategy.
To address these challenges, we propose RoPE-aware Selective Adaptation (RoSA). The core idea is to achieve more targeted and efficient fine-tuning through a dual-level adaptation strategy: targeting critical low-frequency dimensions within layers and selecting the most important layers across the model.

As illustrated in Fig.\ref{fig:framework}, RoSA achieves this through two main components.
First, the RoPE-aware Attention Enhancement (RoAE) module selectively enhances the low-frequency components of RoPE-influenced attention states, which play a critical role in contextual understanding.
Second, the Dynamic Layer Selection (DLS) module identifies and adapts the most important layers during fine-tuning based on a gradient-based importance metric.
By combining frequency-wise and layer-wise selective adaptation, RoSA achieves a more effective and efficient adaptation process.

\subsection{RoPE-aware Attention Enhancement (RoAE)}
Our design builds on the observation that the low-frequency dimensions of RoPE-rotated attention states play a critical role in modeling long-range dependencies and contextual semantics~\cite{barbero2024round, jin2025massive}.
However, conventional PEFT methods do not explicitly consider this frequency structure, instead applying generic adaptations across all dimensions.
This limits their efficiency and effectiveness.
To address this, we introduce the RoPE-aware Attention Enhancement (RoAE) module, which selectively enhances the low-frequency components within the Query (Q) and Key (K) attention states in a lightweight and targeted manner.

\subsubsection{Low-Frequency Components Selection:}
Let the hidden states $\mathbf{H} \in \mathbb{R}^{b \times l \times d}$ denote the input to a Transformer layer, where $b$ is the batch size, $l$ is the sequence length, and $d$ is the hidden dimension. After the standard linear projections produce the query and key tensors, these are reshaped into multi-head representations with shape $[b, h, l, d_h]$, where $h$ is the number of attention heads and $d_h = d / h$ is the dimension per head.
RoPE first splits each head vector into real $\mathbf{z}_{\text{real}}$ and imaginary $\mathbf{z}_{\text{imag}}$ halves, then applies a sinusoidal rotation to every resulting complex pair.

To extract the low-frequency components, we follow the structure of RoPE and split each head vector into two halves of size $d_h/2$. From each half, we take the last $(d_h \cdot r_{\text{low}}) / 2$ dimensions and concatenate them to form a $d_{\text{low}}$-dimensional vector, denoted as $\mathbf{z}_{\text{low}}$. Here, $r_{\text{low}} \in (0, 1)$ is a hyperparameter controlling the ratio of the targeted low-frequency components.
This extracted vector captures the critical low-frequency components of the RoPE-influenced Q/K head, serving as the target for enhancement.
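A minimal sketch of this selection step is given below, assuming rotated Q or K states of shape $[b, h, l, d_h]$ (names and defaults are illustrative):
\begin{verbatim}
import torch

def low_freq_slice(x, r_low=0.5):
    # x: [b, h, l, d_h] rotated Q or K states; keep the last
    # (d_h * r_low) / 2 dimensions of each RoPE half (low frequencies)
    d_h = x.shape[-1]
    k = int(d_h * r_low) // 2              # low-frequency dims per half
    first_half, second_half = x[..., : d_h // 2], x[..., d_h // 2:]
    z_low = torch.cat([first_half[..., -k:], second_half[..., -k:]],
                      dim=-1)
    return z_low                           # [b, h, l, d_low], d_low = 2 * k
\end{verbatim}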

\subsubsection{Adaptation Signal Generation:}
To enhance the extracted low-frequency components in a targeted way, we first generate a context-aware adaptation signal $\mathbf{S}$.
Specifically, the hidden state is passed through a trainable linear projection, $\mathbf{W}_{\text{proj}}$, followed by a non-linear activation (SiLU)~\cite{elfwing2018sigmoid} to introduce non-linearity:
\begin{equation}
\tilde{\mathbf{S}} = \text{SiLU}(\mathbf{H} \mathbf{W}_{\text{proj}}), \quad \mathbf{W}_{\text{proj}} \in \mathbb{R}^{d \times (h \cdot d_{\text{low}})},
\label{eq:roae-proj}
\end{equation}
where $\tilde{\mathbf{S}} \in \mathbb{R}^{b \times l \times (h \cdot d_{\text{low}})}$. We then reshape the projected tensor to the multi-head shape $\mathbf{S}\in\mathbb{R}^{b \times h \times l \times d_{\text{low}}}$.

Notably, to improve parameter efficiency, the projection module $\mathbf{W}_{\text{proj}}$ is implemented using a low-rank decomposition ($\mathbf{W}_{\text{proj}} = \mathbf{B}\mathbf{A}$), adding only a small number of trainable parameters.
Moreover, this design is modular: the low-rank projection can be flexibly replaced by other emerging PEFT parameterizations.

In typical settings, we use the same adaptation signal $\mathbf{S}$ for both the query and key projections.
To ensure compatibility with modern architectures employing Grouped-Query Attention (GQA)~\cite{ainslie2023gqa}, where the numbers of query and key heads, denoted by $h_q$ and $h_k$, may differ, we apply an additional projection module to align the dimensions:
\begin{equation}
\tilde{\mathbf{S}}^{(K)} = \tilde{\mathbf{S}}^{(Q)} \cdot \mathbf{W}_{\text{GQA}}, \quad \mathbf{W}_{\text{GQA}} \in \mathbb{R}^{(h_q \cdot d_{\text{low}}) \times (h_k \cdot d_{\text{low}})},
\label{eq:roae-gqa}
\end{equation}
ensuring compatibility across varying attention configurations and thereby enabling RoAE to support GQA-based models.
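A minimal PyTorch-style sketch of Eq.~(\ref{eq:roae-proj}) and Eq.~(\ref{eq:roae-gqa}) is shown below. It is a sketch only; the module name, the rank, and the use of \texttt{nn.Linear} layers for $\mathbf{A}$, $\mathbf{B}$, and $\mathbf{W}_{\text{GQA}}$ are our assumptions rather than the exact implementation:
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaptationSignal(nn.Module):
    # Low-rank projection W_proj = B A followed by SiLU, plus an
    # optional alignment from query heads to key heads for GQA.
    def __init__(self, d, h_q, h_k, d_low, rank=8):
        super().__init__()
        self.h_q, self.h_k, self.d_low = h_q, h_k, d_low
        self.A = nn.Linear(d, rank, bias=False)
        self.B = nn.Linear(rank, h_q * d_low, bias=False)
        self.gqa = nn.Linear(h_q * d_low, h_k * d_low, bias=False)

    def forward(self, hidden):                 # hidden: [b, l, d]
        s_q = F.silu(self.B(self.A(hidden)))   # [b, l, h_q * d_low]
        s_k = self.gqa(s_q)                    # align to key heads
        b, l, _ = hidden.shape
        s_q = s_q.view(b, l, self.h_q, self.d_low).transpose(1, 2)
        s_k = s_k.view(b, l, self.h_k, self.d_low).transpose(1, 2)
        return s_q, s_k                        # [b, h, l, d_low] each
\end{verbatim}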
|
||||
|
||||
\subsubsection{Targeted Enhancement Application: }
|
||||
After obtaining the adaptation signal $\mathbf{S}$, the final step is to apply it to the targeted low-frequency components.
|
||||
Recall that in the previous step, we extracted the low-frequency vectors $\mathbf{z}_{\text{low}}$ of each head.
|
||||
Denoting the extracted low-frequency components for all attention heads as $\mathbf{Z}\in\mathbb{R}^{b\times h\times l\times d_{\text{low}}}$, we perform the enhancement via an element-wise multiply modulation:
|
||||
\begin{equation}
|
||||
\mathbf{Z}^{*} = \mathbf{Z} + \mathbf{Z}\odot(\alpha\cdot\mathbf{S}),
|
||||
\label{eq:roae-apply}
|
||||
\end{equation}
|
||||
where $\alpha$ is a scaling factor controlling the adaptation strength.
|
||||
|
||||
|
||||
|
||||
Finally, the enhanced low-frequency tensors $\mathbf{Z}^{*}$ are re-integrated into their original positions of the attention head states, replacing the corresponding low-frequency dimensions.
|
||||
The attention mechanism then proceeds with these selectively enhanced query and key representations, allowing the model to better leverage RoPE's critical frequency structure for improved contextual understanding.
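A compact PyTorch sketch of this modulation and re-integration step is given below; it mirrors the slicing used for extraction and is illustrative rather than the exact released code.
\begin{verbatim}
import torch

def apply_enhancement(x, s, r_low, alpha=0.1):
    # x: [b, h, l, d_h] RoPE-influenced head states; s: [b, h, l, d_low].
    # Implements z* = z + z * (alpha * s) on the low-frequency slices,
    # writing the result back into the original positions.
    d_h = x.shape[-1]
    k = int(d_h * r_low) // 2
    s_real, s_imag = s[..., :k], s[..., k:]
    out = x.clone()
    out[..., d_h // 2 - k : d_h // 2] *= 1.0 + alpha * s_real
    out[..., d_h - k :] *= 1.0 + alpha * s_imag
    return out

q = torch.randn(2, 8, 16, 128)
s = torch.randn(2, 8, 16, 32)
print(apply_enhancement(q, s, r_low=0.25).shape)  # torch.Size([2, 8, 16, 128])
\end{verbatim}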
|
||||
|
||||
In summary, the RoAE module introduces a targeted and efficient PEFT paradigm.
|
||||
Its core innovation lies in its mechanism-aware design, which targets the critical components of RoPE-influenced attention states.
|
||||
Furthermore, the enhancement is context-aware, as the adaptation signal is dynamically generated from the input states to provide token-specific modulations.
|
||||
By achieving this with high parameter efficiency and maintaining compatibility across diverse architectures, RoAE provides a flexible and effective way to adapt LLMs to specific tasks.
|
||||
|
||||
\subsection{Dynamic Layer Selection (DLS)}
|
||||
While the RoAE module provides a targeted, mechanism-aware approach to adapting parameters within one layer, LLMs exhibit considerable heterogeneity across different layers, with lower layers primarily capturing syntactic features and higher layers encoding abstract semantic and contextual knowledge~\cite{voita2019bottom}.
|
||||
Applying RoAE uniformly across all layers, as common PEFT methods do, overlooks this layer-wise heterogeneity in importance.
|
||||
To address this, we propose the Dynamic Layer Selection (DLS) strategy, which dynamically selects and adapts the most important layers, improving parameter utilization throughout the fine-tuning process.
|
||||
|
||||
\subsubsection{Layer Importance Estimation:}
|
||||
The core of DLS is to accurately estimate the importance of each layer with respect to the fine-tuning objective.
|
||||
We propose to use the gradient norm of Layer Normalization (LayerNorm) parameters as an efficient proxy for this task.
|
||||
LayerNorm directly controls the information flow between Transformer submodules and layers; a large gradient on its parameters indicates that the model needs to substantially change the output distribution of that layer to minimize the loss.
|
||||
|
||||
In the commonly adopted Pre-LN architecture, LayerNorm modules are placed before the self-attention module and before the FFN module.
|
||||
Formally, for the $i$-th Transformer layer $L_i$, its importance score is calculated by aggregating the $\text{L}_2$ norms of the gradients from the LayerNorm parameters:
|
||||
\begin{equation}
|
||||
\text{Score}(L_i) = \sqrt{ \| \nabla \mathbf{\Theta}_{i, \text{attn}} \|_2^2 + \| \nabla \mathbf{\Theta}_{i, \text{ffn}} \|_2^2 }
|
||||
\label{eq:dls-calc}
|
||||
\end{equation}
|
||||
where $\mathbf{\Theta}_{i, \text{attn}}$ and $\mathbf{\Theta}_{i, \text{ffn}}$ represent the learnable parameters for the two LayerNorm modules in the $i$-th layer.
|
||||
In practice, we periodically compute these importance scores for all layers, providing an informative metric to guide selection.
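A sketch of this scoring step is shown below; the module attribute names assume a Llama/Qwen-style implementation (where the two per-layer norms are \texttt{input\_layernorm} and \texttt{post\_attention\_layernorm}, typically RMSNorm) and should be adapted to the actual backbone. It also assumes gradients are materialized for these parameters, e.g., by keeping \texttt{requires\_grad=True} even though they are not updated.
\begin{verbatim}
def layer_scores(model):
    # Aggregate L2 norms of the per-layer norm gradients
    # (the score equation above), one score per Transformer layer.
    scores = []
    for layer in model.model.layers:
        total = 0.0
        for ln in (layer.input_layernorm, layer.post_attention_layernorm):
            for p in ln.parameters():
                if p.grad is not None:
                    total += p.grad.detach().float().pow(2).sum().item()
        scores.append(total ** 0.5)
    return scores
\end{verbatim}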
|
||||
\subsubsection{Dynamic Selection and Gradient Masking:}
|
||||
The selection procedure is activated periodically at an interval of $u$ steps after an initial warmup phase. At each activation, DLS employs a strategy that balances exploitation and exploration to choose a subset of layers for updates, specifically:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item \textbf{Exploitation:} With a high probability $p_{\text{exploit}}$, we rank all layers based on their scores and select the top-$k$ layers for training, where $k$ is determined by a predefined ratio $k_{\text{ratio}}$.
|
||||
\item \textbf{Exploration:} Conversely, with a probability of $1-p_{\text{exploit}}$, we randomly select $k$ layers to ensure that all layers have a chance to adapt, thus reducing the risk of local optima.
|
||||
\end{itemize}
|
||||
|
||||
Once the set of selected layers $\mathcal{L_S}$ is determined, a gradient mask is applied.
|
||||
Specifically, the gradients of parameters in all non-selected layers are set to zero to prevent updates:
|
||||
\begin{equation}
|
||||
\nabla L_i \leftarrow \mathbf{0},\quad \text{if}\quad i \notin \mathcal{L_S}.
|
||||
\label{eq:dls-mask}
|
||||
\end{equation}
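A minimal sketch of the selection-and-masking step, under the same Llama/Qwen-style layer layout as above (names are illustrative):
\begin{verbatim}
import random

def select_and_mask(model, scores, k_ratio=0.5, p_exploit=0.8):
    layers = list(model.model.layers)
    k = max(1, int(len(layers) * k_ratio))
    if random.random() < p_exploit:      # exploitation: top-k by score
        order = sorted(range(len(layers)),
                       key=lambda i: scores[i], reverse=True)
        selected = set(order[:k])
    else:                                # exploration: random k layers
        selected = set(random.sample(range(len(layers)), k))
    for i, layer in enumerate(layers):   # zero grads outside the selection
        if i not in selected:
            for p in layer.parameters():
                if p.grad is not None:
                    p.grad.zero_()
    return selected
\end{verbatim}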
|
||||
|
||||
|
||||
In summary, DLS reduces unnecessary parameter updates by dynamically identifying and adapting only the most critical layers, leading to improved efficiency and potentially superior downstream task performance.
|
||||
It is noteworthy that DLS is model-agnostic and can be easily integrated into existing PEFT pipelines. Combined with RoAE, which enables selective adaptation over important frequency components, DLS completes the RoSA framework by jointly targeting both dimension-level and layer-level adaptation.
|
||||
|
||||
|
||||
\subsection{Overall Algorithm}
|
||||
RoSA integrates the RoAE and DLS modules into the standard causal language modeling framework, where the model is trained using the cross-entropy loss between predicted and target tokens. These modules operate jointly, enabling targeted adaptation across both frequency dimensions and model layers for effective and efficient fine-tuning.
|
||||
|
||||
The full training procedure is summarized in Algorithm~\ref{alg:rosa},
|
||||
which outlines how RoSA applies frequency-aware enhancements via RoAE and dynamically selects critical layers for update via DLS.
|
||||
Thus, RoSA allocates trainable parameters both dimension-wise and layer-wise according to their functional importance, enhancing overall efficiency.
|
||||
Importantly, RoSA can be seamlessly integrated into existing PEFT frameworks or combined with other fine-tuning techniques due to its modular and adaptive design.
|
||||
\begin{algorithm}[htbp]
|
||||
\caption{RoPE-aware Selective Adaptation (RoSA)}
|
||||
\label{alg:rosa}
|
||||
\begin{algorithmic}[1]
|
||||
\Require Pretrained LLM model $\mathcal{M}$, dataset $\mathcal{D}$, RoAE hyperparameters ($\alpha$, $r_{\text{low}}$), DLS hyperparameters ($k_{\text{ratio}}$, $p_{\text{exploit}}$, $u$), learning rate $\eta$, warmup steps $T_{\text{warmup}}$.
|
||||
\State Initialize RoAE modules with $\alpha$ and $r_{\text{low}}$ and integrate them into $\mathcal{M}$;
|
||||
\State Set only RoSA-related parameters $\mathbf{\Theta}_{\text{RoSA}}$ as trainable;
|
||||
\For{each training step $t$}
|
||||
\State Sample a batch of data from $\mathcal{D}$;
|
||||
\State Compute forward pass with RoAE enhanced attention states (Eq.~\ref{eq:roae-proj}-\ref{eq:roae-apply});
|
||||
\State Compute loss and perform backward pass to obtain gradients;
|
||||
\If{$t > T_{\text{warmup}}$ \textbf{and} $t \bmod u = 0$}
|
||||
\State Calculate layer importance $\text{Score}(L_i)$ using LayerNorm gradients (Eq.~\ref{eq:dls-calc});
|
||||
\State With probability $p_{\text{exploit}}$, select the top $k_{\text{ratio}}$ fraction of layers \textit{(DLS-Exploitation)};
|
||||
otherwise, randomly select a $k_{\text{ratio}}$ fraction of layers \textit{(DLS-Exploration)};
|
||||
|
||||
\EndIf
|
||||
\State Mask gradients of parameters in non-selected layers (Eq.~\ref{eq:dls-mask});
|
||||
\State Update parameters of active layers using optimizer with learning rate $\eta$;
|
||||
\EndFor
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
|
||||
|
||||
\begin{table*}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{0.96\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{0.96}
|
||||
\begin{tabular}{l|lccccccccccc} % ← clean style without vertical rules
|
||||
\toprule
|
||||
\textbf{Backbone LLM} & \textbf{Baseline} & \textbf{\# Param (\%)} & \textbf{BoolQ} & \textbf{PIQA} & \textbf{SIQA} & \textbf{ARC-C} & \textbf{ARC-E} & \textbf{OBQA} & \textbf{HellaSwag} & \textbf{WinoGrande} & \textbf{micro-avg(\%)$\uparrow$} \\ % & \textbf{macro-avg$\uparrow$}
|
||||
\midrule
|
||||
\multirow{9}{*}{\textbf{Qwen 2.5 7B}}
|
||||
& LoRA&0.527 &66.9&86.8&76.7&88.2&93.9&87.2&89.7&72.2&84.3 \\
|
||||
& DoRA&0.546 &68.3&\underline{87.4}&77.2&\underline{89.4}&95.2&88.0&\underline{90.0}&70.4&84.9 \\
|
||||
& AdaLoRA&0.396 &\underline{69.7}&\underline{87.4}&\underline{77.9}&88.9&\textbf{95.7}&\underline{89.4}&\textbf{90.6}&72.6&\underline{85.6} \\
|
||||
& BOFT&0.023 &68.5&86.0&76.1&87.5&94.6&82.4&86.1&65.3&82.4 \\
|
||||
& VERA&0.018 &55.4&83.7&74.1&85.1&93.6&77.2&82.2&64.1&77.9 \\
|
||||
& C3A&0.665 &69.5&87.0&77.5&88.9&95.2&86.6&89.9&71.6&85.0 \\
|
||||
& BONE&0.291 &67.6&84.9&76.8&85.2&94.3&87.4&88.3&\textbf{77.9}&83.9 \\
|
||||
& LN Tuning&0.001 &62.5&86.0&73.3&85.0&93.3&77.2&80.9&62.1&78.4 \\
|
||||
& \framework (ours)&0.261 &\textbf{70.5}&\textbf{88.0}&\textbf{79.1}&\textbf{90.1}&\underline{95.3}&\textbf{89.6}&\textbf{90.6}&\underline{73.7}&\textbf{85.9}* \\
|
||||
\midrule
|
||||
\multirow{9}{*}{\textbf{Llama 3.1 8B}}
|
||||
& LoRA&0.520 &\textbf{71.7}&86.8&75.5&83.1&\underline{92.7}&82.4&\underline{88.6}&68.8&83.7 \\
|
||||
& DoRA&0.537 &71.5&86.9&75.8&83.2&92.5&82.2&88.5&70.0&83.8 \\
|
||||
& AdaLoRA&0.390 &71.1&86.2&74.7&\textbf{83.6}&92.6&82.8&87.2&\underline{70.8}&83.0 \\
|
||||
& BOFT&0.028 &70.5&85.5&72.4&80.0&91.9&79.0&82.4&62.5&79.7 \\
|
||||
& VERA&0.017 &68.8&82.9&68.4&77.6&91.4&77.4&75.2&57.4&75.2 \\
|
||||
& C3A&0.674 &\underline{71.6}&\textbf{87.7}&\underline{76.2}&83.1&92.6&\textbf{84.4}&88.3&70.6&\underline{83.9} \\
|
||||
& BONE&0.274 &64.7&78.4&74.2&72.1&86.8&78.2&81.8&70.3&77.6 \\
|
||||
& LN Tuning&0.003 &70.1&84.6&70.9&80.2&91.8&78.8&80.6&61.8&78.6 \\
|
||||
& \framework (ours)&0.329 &\textbf{71.7}&\underline{87.1}&\textbf{76.4}&\underline{83.3}&\textbf{92.8}&\underline{83.6}&\textbf{89.0}&\textbf{74.8}&\textbf{84.4}* \\
|
||||
\midrule
|
||||
\multirow{9}{*}{\textbf{Gemma 2 9B}}
|
||||
& LoRA&0.581 &69.3&88.0&77.8&\textbf{88.0}&\textbf{95.5}&\underline{87.4}&89.8&\underline{77.4}&85.4 \\
|
||||
& DoRA&0.601 &70.0&87.3&\underline{78.1}&86.1&94.3&87.0&89.4&76.8&85.0 \\
|
||||
& AdaLoRA&0.437 &\underline{72.3}&\underline{88.2}&77.4&87.5&\textbf{95.5}&86.2&89.0&73.4&85.1 \\
|
||||
& BOFT&0.029 &65.2&83.2&72.4&81.7&91.1&75.0&80.3&62.1&77.7 \\
|
||||
& VERA&0.020 &65.2&79.8&66.0&73.8&85.8&61.8&70.5&56.1&70.9 \\
|
||||
& C3A&0.699 &70.7&87.7&77.7&86.9&\underline{94.5}&86.8&\textbf{90.4}&75.3&\underline{85.5} \\
|
||||
& BONE&0.319 &60.3&75.3&66.3&69.0&83.7&74.0&67.3&64.3&68.7 \\
|
||||
& LN Tuning&0.007 &61.2&78.1&66.1&73.2&85.0&65.0&71.9&55.1&70.7 \\
|
||||
& \framework (ours)&0.363 &\textbf{74.0}&\textbf{88.3}&\textbf{78.5}&\underline{87.8}&\textbf{95.5}&\textbf{87.8}&\underline{90.0}&\textbf{77.5}&\textbf{86.2}* \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Performance comparison of RoSA and baseline methods on the Commonsense QA task across three backbone LLMs.
|
||||
\textbf{{\large *}} indicates a statistically significant improvement (\ie two-sided t-test with $p<0.05$) over the best baseline.
|
||||
RoSA consistently achieves the highest average performance under comparable parameter budgets.}
|
||||
\label{tab:main_common}
|
||||
\vspace{-4px}
|
||||
\end{table*}
|
||||
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{1\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1}
|
||||
\begin{tabular}{lcccc} % ← clean style without vertical rules
|
||||
\toprule
|
||||
\textbf{Baseline} & \textbf{Qwen2.5 0.5B} & \textbf{Qwen2.5 1.5B} & \textbf{Qwen2.5 3B} & \textbf{Qwen2.5 7B} \\
|
||||
\midrule
|
||||
AdaLoRA &\underline{53.5}&\underline{75.1}&81.1&\underline{85.6} \\
|
||||
C3A &53.1&74.9&\underline{81.2}&85.0 \\
|
||||
\framework (ours) &\textbf{53.7}&\textbf{75.5}&\textbf{82.0}&\textbf{85.9} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Average Commonsense QA accuracy of RoSA, AdaLoRA, and C3A on Qwen2.5 backbones of varying sizes (0.5B to 7B).
|
||||
\label{tab:scale}
|
||||
}
|
||||
\vspace{-9px}
|
||||
\end{table}
|
||||
\section{Experiments}
|
||||
To comprehensively evaluate the performance of our proposed RoSA, we conduct extensive experiments guided by the following key research questions (RQs):
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textbf{RQ1:} How does RoSA perform compared to state-of-the-art PEFT methods across different backbone LLMs and downstream tasks?
|
||||
\item \textbf{RQ2:} How does RoSA's performance scale with backbone LLMs of different parameter sizes?
|
||||
\item \textbf{RQ3:} What are the contributions of each component within RoSA (RoAE and DLS) to its overall performance?
|
||||
\item \textbf{RQ4:} How do RoSA's key hyperparameters affect its overall performance?
|
||||
\end{itemize}
|
||||
|
||||
We first introduce the experimental setup and then systematically address each of the above research questions.
|
||||
|
||||
|
||||
\subsection{Experimental Setup}
|
||||
\subsubsection{Datasets}
|
||||
We follow the setup from LLM-Adapters~\cite{hu2023llm} and evaluate RoSA on two distinct tasks: Commonsense QA and Arithmetic QA.
|
||||
Specifically, we fine-tune models using \texttt{Commonsense15K} and \texttt{Math10K}, which are constructed from multiple data sources.
|
||||
For the \textit{Commonsense} task, we evaluate on eight diverse benchmarks: BoolQ, PIQA, SIQA, ARC-Challenge, ARC-Easy, OBQA, HellaSwag, and WinoGrande.
|
||||
Further, we assess performance on the \textit{Arithmetic} task using seven benchmarks: MultiArith, GSM8K, AddSub, AQuA, SingleEq, SVAMP, and MAWPS.
|
||||
We report accuracy on each benchmark as the evaluation metric. Further details can be found in the Appendix.
|
||||
|
||||
\subsubsection{Backbone Models}
|
||||
We select three powerful and widely-used LLMs as backbone models to validate the generalization of RoSA: Qwen2.5-7B~\cite{bai2023qwen}, Llama-3.1-8B~\cite{dubey2024llama}, and Gemma2-9B~\cite{team2024gemma}.
|
||||
|
||||
\subsubsection{Baseline Methods}
|
||||
We evaluate our approach against a comprehensive set of recent and diverse PEFT methods. Specifically, we compare several low-rank methods and their variants, including the basic \textbf{LoRA}~\cite{hu2021lora}, its weight-decomposing successor \textbf{DoRA}~\cite{liu2024dora}, the dynamically rank-allocating \textbf{AdaLoRA}~\cite{zhang2023adalora}, and \textbf{VERA}~\cite{kopiczko2023vera}, which shares low-rank matrices across modules.
|
||||
Methods leveraging more complex structured matrices are also included, such as the orthogonality-enforcing \textbf{BOFT}~\cite{liu2023parameter}, the circular-convolution-based \textbf{C3A}~\cite{chen2024parameter}, and the block-affine-transformation-based \textbf{BONE}~\cite{kang2024balancing}. Finally, we include \textbf{LN Tuning}~\cite{zhao2023tuning}, a simple yet effective method that tunes only the model's Layer Normalization parameters.
|
||||
|
||||
\subsubsection{Implementation Details}
|
||||
All experiments are conducted on NVIDIA GeForce RTX 3090 GPUs using PyTorch and HuggingFace Transformers.
|
||||
We use the AdamW optimizer with a learning rate of $1\times10^{-3}$.
|
||||
Hyperparameters used in RoSA are as follows: low-frequency dimension ratio $r_{\text{low}}$: 0.25, scaling factor $\alpha$: 0.1, low-rank projection dimension: 128, layer selection ratio $k_{\text{ratio}}$: 0.5, selection interval $u$: 40 steps, and exploitation probability $p_{\text{exploit}}$: 0.8. For detailed implementation, please refer to the Appendix and our code for reproducibility\footnote{\codelink}.
|
||||
|
||||
\subsection{Overall Performance (RQ1, 2)}
|
||||
To answer RQ1, we compare RoSA against all baselines on two distinct tasks: Commonsense and Arithmetic QA.
|
||||
The results are summarized in Table \ref{tab:main_common} and Table \ref{tab:main_arith}, respectively.
|
||||
|
||||
As shown in Table \ref{tab:main_common}, RoSA consistently achieves the best average performance across all three backbone models while using a relatively small number of trainable parameters. This supports our premise that the low-frequency components introduced by RoPE play a crucial role in the model's contextual understanding.
|
||||
Among LoRA variants, AdaLoRA's dynamic rank allocation yields better performance, which aligns with the dynamic-selection principle behind our DLS module. Methods such as C3A, which employ novel adapter designs, also show competitive results, highlighting the potential of more complex structured matrices for improving parameter efficiency.
|
||||
Additionally, LN Tuning performs reasonably well given its minimal number of trainable parameters, further supporting the use of LayerNorm as an importance proxy in DLS.
|
||||
|
||||
To further validate RoSA's capabilities, we also conduct a focused comparison on the Arithmetic QA task, using the Qwen2.5-7B model due to space constraints. The results, summarized in Table \ref{tab:main_arith}, are consistent with those on the Commonsense task: RoSA again achieves the best average performance among all methods.
|
||||
|
||||
To further answer RQ2, we investigate how RoSA's performance scales with model size. We evaluate four Qwen2.5 variants (0.5B, 1.5B, 3B, and 7B) on the Commonsense QA task, comparing against two strong baselines, AdaLoRA and C3A. As shown in Table~\ref{tab:scale}, all methods improve with larger models, but RoSA consistently maintains a clear advantage across scales, highlighting its robustness and scalability.
|
||||
|
||||
\begin{table*}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{0.94\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{0.95}
|
||||
\begin{tabular}{lcccccccccc}
|
||||
\toprule
|
||||
\textbf{Baseline} & \textbf{\# Param (\%)} & \textbf{MultiArith} & \textbf{GSM8K} & \textbf{AddSub} & \textbf{AQuA} & \textbf{SingleEq} & \textbf{SVAMP} & \textbf{MAWPS} & \textbf{micro-avg(\%)$\uparrow$} \\
|
||||
\midrule
|
||||
LoRA&0.527 &93.0&68.7&88.8&33.8&\underline{88.9}&79.2&88.2&77.7 \\
|
||||
DoRA&0.546 &92.3&\underline{70.0}&88.6&34.6&88.5&79.6&87.3&78.1 \\
|
||||
AdaLoRA&0.396 &90.0&68.8&85.3&33.8&85.6&78.9&84.0&76.3 \\
|
||||
BOFT&0.023 &89.6&67.8&82.5&31.1&86.2&75.2&80.2&74.6 \\
|
||||
VERA&0.018 &72.5&63.7&80.7&31.1&80.3&74.2&83.1&70.0 \\
|
||||
C3A&0.665 &\textbf{95.3}&67.1&\underline{90.3}&\textbf{35.4}&\textbf{90.1}&\underline{82.1}&\underline{89.4}&\underline{78.7} \\
|
||||
BONE&0.291 &92.8&66.6&89.6&33.4&88.3&\underline{82.1}&89.0&77.8 \\
|
||||
LN Tuning&0.001 &79.6&63.6&72.1&34.2&75.3&68.1&70.1&67.7 \\
|
||||
\framework (ours)&0.261 &\underline{94.3}&\textbf{71.3}&\textbf{92.1}&\underline{35.0}&\textbf{90.1}&\textbf{82.2}&\textbf{92.0}&\textbf{80.1}* \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Evaluation of RoSA and baseline methods on the Arithmetic QA task using the Qwen2.5-7B model. RoSA achieves the highest average accuracy across all benchmarks, demonstrating its generalization to mathematical tasks.}
|
||||
\label{tab:main_arith}
|
||||
\vspace{-4px}
|
||||
\end{table*}
|
||||
|
||||
|
||||
|
||||
\subsection{Ablation and Hyperparameter Analysis~(RQ3, 4)}
|
||||
We then perform ablation and hyperparameter studies to analyze the contribution of RoSA's components and its sensitivity to key hyperparameters.
|
||||
All results in this section are reported as average performance on the Commonsense QA task with Qwen2.5-7B.
|
||||
\subsubsection{Ablation Study: }
|
||||
We first conduct an ablation study comparing the full RoSA framework against several variants to evaluate the contributions of its components, as shown in Table~\ref{tab:abla}.
|
||||
The full \textbf{RoSA} model includes both RoAE and DLS. We first examine the \textbf{RoSA-RoAEonly} variant by disabling DLS for evaluating the impact of layer selection.
|
||||
We further investigate several RoAE replacement and modification variants, all retaining DLS:
|
||||
(i) \textbf{RoSA-RoAE0.5}, which sets the low-frequency dimension ratio $r_{\text{low}}$ to 0.5 while keeping all other settings unchanged,
|
||||
(ii) \textbf{RoSA-Lr128}, which applies standard LoRA on Q/K with all other configurations identical to RoSA, and
|
||||
(iii) \textbf{RoSA-Lr64}, which uses LoRA with a similar number of trainable parameters as RoSA.
|
||||
These variants also provide an implicit analysis of the effect of $r_{\text{low}}$, allowing us to compare targeted adaptation on varying frequency ranges.
|
||||
Overall, the results indicate that each component of RoSA contributes to performance, and focusing adaptation on a compact low-frequency subspace is more effective.
|
||||
|
||||
\subsubsection{Sensitivity of DLS: }
|
||||
|
||||
To further evaluate the DLS module, we analyze the sensitivity of the layer selection ratio $k_{\text{ratio}}$, which controls the proportion of layers updated during fine-tuning. We vary $k_{\text{ratio}}$ over a range of values. As summarized in Fig.~\ref{fig:sens_dls}, RoSA performs best when $k_{\text{ratio}} \approx 0.5$. Increasing this ratio slightly degrades performance, suggesting that selectively updating fewer layers leads to more efficient optimization and enhances overall model performance.
|
||||
|
||||
\begin{figure}[tb]
|
||||
\vspace{-3pt}
|
||||
\centering
|
||||
\resizebox{0.98\linewidth}{!}{
|
||||
|
||||
\begin{minipage}[t]{.45\columnwidth}
|
||||
\centering
|
||||
\captionsetup{font=small}
|
||||
\vspace{-0.1pt}
|
||||
\renewcommand{\arraystretch}{0.95}
|
||||
\resizebox{.9\linewidth}{!}{%
|
||||
\begin{tabular}{lc}
|
||||
\toprule
|
||||
\textbf{Variant} & \textbf{micro-avg$\uparrow$}\\
|
||||
\midrule
|
||||
RoSA & \textbf{85.9}\\
|
||||
\makecell[l]{RoSA-RoAEonly\\ (w/o DLS)} & 84.8\\
|
||||
\makecell[l]{RoSA-RoAE0.5\\ (w/ DLS \& RoAE)} & 85.6\\
|
||||
\makecell[l]{RoSA-Lr128\\ (w/ DLS, w/o RoAE)} & 83.9\\
|
||||
\makecell[l]{RoSA-Lr64\\ (w/ DLS, w/o RoAE)} & 80.7\\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\captionof{table}{Ablation results of RoSA on Commonsense task using Qwen2.5-7B.}
|
||||
% \vspace{1pt}
|
||||
\label{tab:abla}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}[t]{.45\columnwidth}
|
||||
\vspace{0.1pt}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{assets/impact_larger.pdf}
|
||||
\captionsetup{font=small}
|
||||
\captionof{figure}{Impact of layer selection ratio $k_{\text{ratio}}$.}
|
||||
\label{fig:sens_dls}
|
||||
\end{minipage}
|
||||
}
|
||||
\vspace{-9px}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\section{Related Work}
|
||||
|
||||
\subsection{Parameter-Efficient Fine-Tuning}
|
||||
Parameter-Efficient Fine-Tuning (PEFT) aims to adapt LLMs to downstream tasks by tuning only a small subset of parameters, significantly reducing computational and memory costs.
|
||||
Adapter-based methods insert small trainable modules into Transformer layers, enabling effective task adaptation with minimal parameters~\cite{houlsby2019parameter}.
|
||||
Prompt-based approaches, such as prefix-tuning~\cite{li2021prefix}, freeze pretrained weights and optimize only task-specific prompts.
|
||||
Low-rank methods like LoRA~\cite{hu2021lora} and its variants, including DoRA~\cite{liu2024dora}, AdaLoRA~\cite{zhang2023adalora}, and VERA~\cite{kopiczko2023vera}, inject trainable low-rank matrices into pretrained weights to achieve efficient adaptation.
|
||||
Advanced structured-matrix methods, such as C3A~\cite{chen2024parameter} and BONE~\cite{kang2024balancing}, introduce circular convolutions or block-affine transformations into PEFT, further enhancing parameter efficiency through structured constraints.
|
||||
However, most existing methods apply adaptation uniformly across model components, often neglecting their distinct functional roles.
|
||||
|
||||
\subsection{Analysis of LLM Internals}
|
||||
Understanding the internal mechanics of LLMs is a growing research area that provides crucial insights for developing more principled and efficient methods.
|
||||
Early research shows that each FFN can be seen as a key-value memory~\cite{geva2021transformer}.
|
||||
Recent work provides evidence that attention mechanisms are crucial for retrieving relevant context and enabling dynamic reasoning, whereas the FFN layers are responsible for memorizing task-specific or factual content~\cite{dong2025attention}.
|
||||
RoPE in particular has been examined in recent studies, which show that it induces strong, dense activations in the low-frequency dimensions of attention states and that these activations are crucial for LLMs' contextual understanding capabilities~\cite{jin2025massive, barbero2024round}.
|
||||
Meanwhile, analyses of layer-wise behavior reveal that not all layers are equally important~\cite{belinkov2018evaluating}.
|
||||
These findings underscore that different submodules contribute unique and complementary functions in LLMs, which motivates our RoSA method.
|
||||
|
||||
\section{Conclusion}
|
||||
In this work, we introduce RoPE-aware Selective Adaptation (RoSA), a novel PEFT framework for LLMs.
|
||||
RoSA explicitly leverages the frequency structure induced by RoPE by introducing a RoPE-aware Attention Enhancement (RoAE) module, which selectively enhances low-frequency attention components. Alongside, the Dynamic Layer Selection (DLS) strategy dynamically identifies and updates the most important layers based on LayerNorm gradients.
|
||||
This dual-level design enables more effective and targeted use of trainable parameters both within and across layers.
|
||||
Extensive experiments on fifteen commonsense and arithmetic QA datasets, covering multiple LLM families and model sizes, demonstrate that RoSA consistently outperforms baseline PEFT methods under comparable trainable parameters.
|
||||
|
||||
\appendix
|
||||
\section{Appendix}
|
||||
|
||||
\subsection{Quantitative Analysis of Layer Selection Behavior}
|
||||
To provide deeper insight into the effectiveness of the Dynamic Layer Selection (DLS) module, we visualize how frequently each layer is selected during training on the Commonsense QA task with the Qwen2.5-7B model. The results, illustrated in Fig.~\ref{fig:layer_sel}, show clear heterogeneity across layers: certain layers are consistently identified as more important and are selected more frequently for adaptation, while others are rarely chosen. The presence of layers with very low selection frequencies also highlights the necessity of the combined exploitation-exploration strategy in DLS, which avoids neglecting potentially valuable but infrequently chosen layers. These observations empirically support our motivation for dynamically allocating parameters and demonstrate that DLS identifies the layers that contribute most to downstream task performance.
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.75\linewidth]{assets/layer_sel.pdf}
|
||||
\caption{Visualization of layer selection frequency by the DLS module across the training process on Qwen2.5-7B.
|
||||
}
|
||||
\label{fig:layer_sel}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Experimental and Implementation Details}
|
||||
All experiments reported in the main paper use a primary random seed of 2333, with additional experiments repeated using seeds 1000, 2000, 3000, and 4000 to assess statistical significance and reproducibility.
|
||||
For computational efficiency, training employs mixed-precision (BF16) and DeepSpeed optimization~\cite{rasley2020deepspeed} configured with ZeRO Stage 1. One training epoch typically takes approximately 1 hour on a single NVIDIA RTX 3090 GPU.
|
||||
A randomly selected validation set of 300 samples from the training data is used for checkpoint evaluation during training. The checkpoint with the lowest validation loss is chosen for testing.
|
||||
|
||||
Detailed hyperparameter settings are summarized below:
|
||||
|
||||
\begin{itemize}
|
||||
\item Optimizer: AdamW
|
||||
\item Learning rate: $1\times10^{-3}$
|
||||
\item Learning rate scheduler: cosine
|
||||
\item Batch size: 2 (with gradient accumulation steps of 2)
|
||||
\item Warmup ratio: 0.05
|
||||
\item Max sequence length: 2048
|
||||
\item Low-frequency dimension ratio ($r_{\text{low}}$): 0.25
|
||||
\item Scaling factor ($\alpha$): 0.1
|
||||
\item Low-rank projection dimension: 128
|
||||
\item Dynamic layer selection ratio ($k_{\text{ratio}}$): 0.5
|
||||
\item Selection interval ($u$): every 40 steps
|
||||
\item Exploitation probability ($p_{\text{exploit}}$): 0.8
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\subsubsection{Software and Environment}
|
||||
|
||||
The experiments were conducted using the following software packages and versions for reproducibility:
|
||||
|
||||
\begin{itemize}
|
||||
\item torch==2.1.2
|
||||
\item deepspeed==0.12.6
|
||||
\item numpy==1.26.4
|
||||
\item peft==0.16.0
|
||||
\item transformers==4.47.1
|
||||
\item tokenizers==0.21.2
|
||||
\item CUDA==12.1
|
||||
\end{itemize}
|
||||
|
||||
The hardware environment configuration is as follows:
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item OS: Ubuntu 20.04 LTS
|
||||
\item CPU: Intel Xeon Silver 4214R
|
||||
\item GPU: NVIDIA GeForce RTX 3090
|
||||
\item Memory: 512GB RAM
|
||||
\end{itemize}
|
||||
Detailed implementation and datasets can be found in our codebase\footnote{\codelink}.
|
||||
|
||||
|
||||
\subsection{Baseline Implementation Details}
|
||||
|
||||
We briefly summarize the implementation details of baseline methods used in experiments:
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textbf{LoRA}~\cite{hu2021lora}: Low-rank adaptation applied to all linear layers, with rank dimension $r=16$.
|
||||
\item \textbf{DoRA}~\cite{liu2024dora}: LoRA variant with decomposition-based weights applied to all linear layers, rank dimension $r=16$.
|
||||
\item \textbf{AdaLoRA}~\cite{zhang2023adalora}: Dynamically adjusts rank dimensions during training, applied to all linear layers. The initial rank is set to 32, targeting an average rank of 16 over training steps.
|
||||
\item \textbf{BOFT}~\cite{liu2023parameter}: Uses block orthogonal transformations, with butterfly block size set to 4, two butterfly factors, and dropout rate of 0.1, targeting attention query and value projections.
|
||||
\item \textbf{VeRA}~\cite{kopiczko2023vera}: Shares low-rank matrices across all linear layers, using the same rank dimension $r=16$.
|
||||
\item \textbf{C3A}~\cite{chen2024parameter}: Applies circular-convolution-based adapters to all linear layers, with block size 128.
|
||||
\item \textbf{BONE}~\cite{kang2024balancing}: Utilizes block affine transformations, targeting all linear layers, and rank dimension $r=16$.
|
||||
\item \textbf{LN Tuning}~\cite{zhao2023tuning}: Trains only LayerNorm parameters, keeping all other parameters frozen.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\subsection{Evaluation Protocol and Metrics}
|
||||
|
||||
\subsubsection{Generation Procedure.}
|
||||
All model outputs are generated using auto-regressive decoding via the \texttt{generate()} API in Hugging Face Transformers.
|
||||
We employ greedy decoding~(\texttt{do\_sample=False}), and set a maximum of 256 new tokens~(\texttt{max\_new\_tokens=256}).
|
||||
|
||||
Each input follows a unified instruction template, as shown below:
|
||||
\begin{tcolorbox}[boxrule=0.8pt]
|
||||
\textless s\textgreater Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
\#\#\# Instruction:\\
|
||||
\{instruction\}
|
||||
\\
|
||||
\\
|
||||
\#\#\# Response:
|
||||
\end{tcolorbox}
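For reference, a minimal generation sketch matching these settings is shown below; the checkpoint path is a placeholder for the fine-tuned model directory, and BOS handling is left to the tokenizer.
\begin{verbatim}
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

TEMPLATE = ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Response:\n")

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B", torch_dtype=torch.bfloat16, device_map="auto")

prompt = TEMPLATE.format(instruction="James has 3 apples and buys 4 more. "
                                     "How many apples does he have?")
inputs = tok(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=256)
print(tok.decode(out[0][inputs["input_ids"].shape[1]:],
                 skip_special_tokens=True))
\end{verbatim}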
|
||||
|
||||
\subsubsection{Answer Extraction and Accuracy Calculation.}
|
||||
Results are calculated based on extracted predictions from generated outputs using task-specific regular expressions:
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textit{Commonsense QA:} Extracted exact-match answers (true/false, or the solution/answer/ending options) and computed accuracy by direct matching against ground-truth labels.
|
||||
\item \textit{Arithmetic QA:} Extracted numerical answers from the output text (with an absolute tolerance of $10^{-3}$) or alphabetic choices (A--E) for the AQuA dataset; a minimal sketch of this extraction follows the list.
|
||||
\end{itemize}
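A minimal sketch of the arithmetic extraction logic is shown below; the exact regular expressions in our scripts may differ, and the helper names are illustrative.
\begin{verbatim}
import re

def extract_numeric(text, gold, tol=1e-3):
    # Take the last number in the response; compare with absolute tolerance.
    nums = re.findall(r"-?\d+(?:\.\d+)?", text.replace(",", ""))
    return bool(nums) and abs(float(nums[-1]) - float(gold)) <= tol

def extract_choice(text):
    # AQuA: last standalone letter A-E in the response.
    m = re.findall(r"\b([A-E])\b", text)
    return m[-1] if m else None

print(extract_numeric("The answer is 42.", 42))   # True
print(extract_choice("So the answer is (C)."))    # C
\end{verbatim}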
|
||||
|
||||
All extraction and accuracy computation scripts are provided for reproducibility in our codebase.
|
||||
|
||||
|
||||
\subsection{Dataset Details}
|
||||
|
||||
\subsubsection{Training Datasets}
|
||||
We utilize two unified instruction-tuning datasets provided by LLM-Adapters~\cite{hu2023llm}:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item \textbf{Commonsense15K} covers a wide range of commonsense reasoning questions. All examples are template-normalized into a consistent instruction format, supporting robust cross-task generalization.
|
||||
\item \textbf{Math10K} comprises diverse math word problems, each annotated with a step-by-step chain-of-thought solution and a final answer, enabling thorough evaluation of arithmetic reasoning under instruction-following settings.
|
||||
\end{itemize}
|
||||
The summary of dataset statistics is provided in Table~\ref{tab:dataset}.
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{0.95\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.01}
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\textbf{Dataset} & \textbf{Samples} & \textbf{Total Tokens} & \textbf{Avg. Tokens/Sample} \\
|
||||
\midrule
|
||||
Commonsense15K & 15,119 & 1,778,782 & 117.65 \\
|
||||
Math10K & 9,919 & 2,273,016 & 229.16 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Statistics of the training datasets for commonsense and arithmetic QA tasks.
|
||||
}
|
||||
\label{tab:dataset}
|
||||
\end{table}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
\subsubsection{Evaluation Benchmarks}
|
||||
We evaluate model performance on a suite of well-established commonsense and arithmetic QA benchmarks, enabling comprehensive evaluation of both generalization and robustness.
|
||||
Detailed statistics for all evaluation datasets can be found in Table~\ref{tab:commonsense-datasets}~(Commonsense) and Table \ref{tab:arith-datasets}~(Arithmetic).
|
||||
|
||||
\noindent \textbf{a) Commonsense QA:}
|
||||
\begin{itemize}[leftmargin=1em]
|
||||
\item \textbf{BoolQ}~\cite{clark2019boolq}: BoolQ is a yes/no question answering dataset featuring naturally occurring, information-seeking queries and passage-based inference.
|
||||
\item \textbf{PIQA}~\cite{bisk2020piqa}: PIQA is a benchmark for physical commonsense reasoning, focused on practical everyday tasks with two candidate solutions.
|
||||
\item \textbf{SIQA}~\cite{sap2019socialiqa}: Social IQa is a multiple-choice benchmark that tests social and emotional commonsense reasoning in daily situations.
|
||||
\item \textbf{ARC-Challenge / ARC-Easy}~\cite{clark2018think}: The AI2 Reasoning Challenge (ARC) is a science question answering benchmark consisting of grade-school level, multiple-choice questions divided into Easy and Challenge subsets by difficulty.
|
||||
\item \textbf{OBQA}~\cite{mihaylov2018can}: OpenBookQA is a science question answering benchmark requiring multi-step reasoning over a provided set of core science facts.
|
||||
\item \textbf{HellaSwag}~\cite{zellers2019hellaswag}: HellaSwag is a natural language inference benchmark with adversarially-filtered continuations requiring robust commonsense reasoning.
|
||||
\item \textbf{WinoGrande}~\cite{sakaguchi2020winogrande}: WinoGrande is a binary fill-in-the-blank pronoun resolution benchmark designed to require advanced commonsense reasoning.
|
||||
\end{itemize}
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{1\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.01}
|
||||
\begin{tabular}{lcc}
|
||||
\toprule
|
||||
\textbf{Dataset} & \textbf{Samples} & \textbf{Answer Format} \\
|
||||
\midrule
|
||||
BoolQ & 3,270 & true / false \\
|
||||
PIQA & 1,838 & solution1 / solution2 \\
|
||||
SIQA & 1,954 & answer1 / answer2 / answer3 \\
|
||||
ARC-Challenge & 1,172 & answer1 / answer2 / answer3 / answer4 \\
|
||||
ARC-Easy & 2,376 & answer1 / answer2 / answer3 / answer4 \\
|
||||
OBQA & 500 & answer1 / answer2 / answer3 / answer4 \\
|
||||
HellaSwag & 10,042 & ending1 / ending2 / ending3 / ending4 \\
|
||||
WinoGrande & 1,267 & option1 / option2 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Statistics of Commonsense QA Test Datasets.}
|
||||
\label{tab:commonsense-datasets}
|
||||
\end{table}
|
||||
|
||||
\noindent \textbf{b) Arithmetic QA:}
|
||||
\begin{itemize}[leftmargin=1em]
|
||||
\item \textbf{MultiArith}~\cite{roy2016solving}: MultiArith contains multi-step arithmetic word problems to evaluate a system's ability to handle complex reasoning chains.
|
||||
\item \textbf{GSM8K}~\cite{cobbe2021training}: GSM8K is a dataset of linguistically diverse grade-school math word problems, designed for benchmarking multi-step arithmetic reasoning with natural language solutions.
|
||||
\item \textbf{AddSub}~\cite{hosseini2014learning}: AddSub is a corpus of short word problems focused exclusively on addition and subtraction, used to assess basic arithmetic reasoning capabilities.
|
||||
\item \textbf{AQuA}~\cite{ling2017program}: AQuA is a large-scale dataset of algebraic word problems, each paired with natural language rationales to support step-by-step reasoning.
|
||||
\item \textbf{SingleEq}~\cite{koncel2015parsing}: SingleEq is a collection of multi-sentence algebraic word problems, emphasizing equation tree parsing and formal reasoning.
|
||||
\item \textbf{SVAMP}~\cite{patel2021nlp}: SVAMP is a challenge set constructed from elementary math word problems, aimed at evaluating a model's robustness to question sensitivity, structural variations, and reasoning challenges.
|
||||
\item \textbf{MAWPS}~\cite{koncel2016mawps}: MAWPS is a repository that aggregates math word problems from multiple sources, offering a unified benchmark for evaluating models.
|
||||
\end{itemize}
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{0.8\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{0.95}
|
||||
\begin{tabular}{lcc}
|
||||
\toprule
|
||||
\textbf{Dataset} & \textbf{Samples} & \textbf{Answer Type} \\
|
||||
\midrule
|
||||
MultiArith & 600 & Numeric \\
|
||||
GSM8K & 1,319 & Numeric \\
|
||||
AddSub & 395 & Numeric \\
|
||||
AQuA & 254 & Multiple Choice (A--E) \\
|
||||
SingleEq & 508 & Numeric \\
|
||||
SVAMP & 1,000 & Numeric \\
|
||||
MAWPS & 238 & Numeric \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Statistics of Arithmetic QA Test Datasets.}
|
||||
\label{tab:arith-datasets}
|
||||
\end{table}
|
||||
|
||||
609
mypaper/CIKM2025_HyCAM.bib
Executable file
609
mypaper/CIKM2025_HyCAM.bib
Executable file
@@ -0,0 +1,609 @@
|
||||
@article{fu2025training,
|
||||
title={Training-free LLM Merging for Multi-task Learning},
|
||||
author={Fu, Zichuan and Wu, Xian and Wang, Yejing and Wang, Wanyu and Ye, Shanshan and Yin, Hongzhi and Chang, Yi and Zheng, Yefeng and Zhao, Xiangyu},
|
||||
journal={arXiv preprint arXiv:2506.12379},
|
||||
year={2025}
|
||||
}
|
||||
@inproceedings{wang2025put,
|
||||
title={Put Teacher in Student's Shoes: Cross-Distillation for Ultra-compact Model Compression Framework},
|
||||
author={Wang, Maolin and Chu, Jun and Xie, Sicong and Zang, Xiaoling and Zhao, Yao and Zhong, Wenliang and Zhao, Xiangyu},
|
||||
booktitle={Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 2},
|
||||
pages={4975--4985},
|
||||
year={2025}
|
||||
}
|
||||
@article{wang2023large,
|
||||
title={Large multimodal model compression via efficient pruning and distillation at AntGroup},
|
||||
author={Wang, Maolin and Zhao, Yao and Liu, Jiajia and Chen, Jingdong and Zhuang, Chenyi and Gu, Jinjie and Guo, Ruocheng and Zhao, Xiangyu},
|
||||
journal={arXiv preprint arXiv:2312.05795},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{liu2024moe,
|
||||
title={When moe meets llms: Parameter efficient fine-tuning for multi-task medical applications},
|
||||
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
|
||||
booktitle={Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
|
||||
pages={1104--1114},
|
||||
year={2024}
|
||||
}
|
||||
@inproceedings{wang2025metalora,
|
||||
title={MetaLoRA: Tensor-Enhanced Adaptive Low-Rank Fine-Tuning},
|
||||
author={Wang, Maolin and Zhao, Xiangyu and Guo, Ruocheng and Wang, Junhui},
|
||||
booktitle={2025 IEEE 41st International Conference on Data Engineering (ICDE)},
|
||||
pages={4680--4684},
|
||||
year={2025},
|
||||
organization={IEEE}
|
||||
}
|
||||
@inproceedings{wang2024llm4msr,
|
||||
title={Llm4msr: An llm-enhanced paradigm for multi-scenario recommendation},
|
||||
author={Wang, Yuhao and Wang, Yichao and Fu, Zichuan and Li, Xiangyang and Wang, Wanyu and Ye, Yuyang and Zhao, Xiangyu and Guo, Huifeng and Tang, Ruiming},
|
||||
booktitle={Proceedings of the 33rd ACM International Conference on Information and Knowledge Management},
|
||||
pages={2472--2481},
|
||||
year={2024}
|
||||
}
|
||||
@article{luo2024moelora,
|
||||
title={Moelora: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
|
||||
author={Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
|
||||
journal={arXiv preprint arXiv:2402.12851},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2024large,
|
||||
title={Large language model based multi-agents: A survey of progress and challenges},
|
||||
author={Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
|
||||
journal={arXiv preprint arXiv:2402.01680},
|
||||
year={2024}
|
||||
}
|
||||
@article{zhao2023survey,
|
||||
title={A survey of large language models},
|
||||
author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
|
||||
journal={arXiv preprint arXiv:2303.18223},
|
||||
year={2023}
|
||||
}
|
||||
@article{gao2024higher,
|
||||
title={Higher layers need more lora experts},
|
||||
author={Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
|
||||
journal={arXiv preprint arXiv:2402.08562},
|
||||
year={2024}
|
||||
}
|
||||
@article{ji2023multi,
|
||||
title={Multi-factor spatio-temporal prediction based on graph decomposition learning},
|
||||
author={Ji, Jiahao and Wang, Jingyuan and Mou, Yu and Long, Cheng},
|
||||
journal={arXiv preprint arXiv:2310.10374},
|
||||
year={2023}
|
||||
}
|
||||
@article{ji2025seeing,
|
||||
title={Seeing the unseen: Learning basis confounder representations for robust traffic prediction},
|
||||
author={Ji, Jiahao and Zhang, Wentao and Wang, Jingyuan and Huang, Chao},
|
||||
year={2025}
|
||||
}
|
||||
@inproceedings{wang2025gtg,
|
||||
title={GTG: Generalizable Trajectory Generation Model for Urban Mobility},
|
||||
author={Wang, Jingyuan and Lin, Yujing and Li, Yudong},
|
||||
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
volume={39},
|
||||
number={1},
|
||||
pages={834--842},
|
||||
year={2025}
|
||||
}
|
||||
@inproceedings{cheng2025poi,
|
||||
title={Poi-enhancer: An llm-based semantic enhancement framework for poi representation learning},
|
||||
author={Cheng, Jiawei and Wang, Jingyuan and Zhang, Yichuan and Ji, Jiahao and Zhu, Yuanshao and Zhang, Zhibo and Zhao, Xiangyu},
|
||||
booktitle={Proceedings of the AAAI conference on artificial intelligence},
|
||||
volume={39},
|
||||
number={11},
|
||||
pages={11509--11517},
|
||||
year={2025}
|
||||
}
|
||||
@inproceedings{han2025bridging,
|
||||
title={Bridging traffic state and trajectory for dynamic road network and trajectory representation learning},
|
||||
author={Han, Chengkai and Wang, Jingyuan and Wang, Yongyao and Yu, Xie and Lin, Hao and Li, Chao and Wu, Junjie},
|
||||
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
volume={39},
|
||||
number={11},
|
||||
pages={11763--11771},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
@inproceedings{yu2025bigcity,
|
||||
title={BIGCity: A universal spatiotemporal model for unified trajectory and traffic state data analysis},
|
||||
author={Yu, Xie and Wang, Jingyuan and Yang, Yifan and Huang, Qian and Qu, Ke},
|
||||
booktitle={2025 IEEE 41st International Conference on Data Engineering (ICDE)},
|
||||
pages={4455--4469},
|
||||
year={2025},
|
||||
organization={IEEE}
|
||||
}
|
||||
|
||||
@article{zhang2024veccity,
|
||||
title={VecCity: A taxonomy-guided library for map entity representation learning},
|
||||
author={Zhang, Wentao and Wang, Jingyuan and Yang, Yifan and others},
|
||||
journal={arXiv preprint arXiv:2411.00874},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
@article{hettige2024airphynet,
|
||||
title={Airphynet: Harnessing physics-guided neural networks for air quality prediction},
|
||||
author={Hettige, Kethmi Hirushini and Ji, Jiahao and Xiang, Shili and Long, Cheng and Cong, Gao and Wang, Jingyuan},
|
||||
journal={arXiv preprint arXiv:2402.03784},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
@article{wang2023rethinking,
|
||||
title={Rethinking the evaluation for conversational recommendation in the era of large language models},
|
||||
author={Wang, Xiaolei and Tang, Xinyu and Zhao, Wayne Xin and Wang, Jingyuan and Wen, Ji-Rong},
|
||||
journal={arXiv preprint arXiv:2305.13112},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
|
||||
@article{li2023web,
|
||||
title={The web can be your oyster for improving large language models},
|
||||
author={Li, Junyi and Tang, Tianyi and Zhao, Wayne Xin and Wang, Jingyuan and Nie, Jian-Yun and Wen, Ji-Rong},
|
||||
journal={arXiv preprint arXiv:2305.10998},
|
||||
year={2023}
|
||||
}
|
||||
@article{du2021gan,
|
||||
title={GAN-based anomaly detection for multivariate time series using polluted training set},
|
||||
author={Du, Bowen and Sun, Xuanxuan and Ye, Junchen and Cheng, Ke and Wang, Jingyuan and Sun, Leilei},
|
||||
journal={IEEE Transactions on Knowledge and Data Engineering},
|
||||
volume={35},
|
||||
number={12},
|
||||
pages={12208--12219},
|
||||
year={2021},
|
||||
publisher={IEEE}
|
||||
}
|
||||
|
||||
@article{li2023e4srec,
|
||||
title={E4srec: An elegant effective efficient extensible solution of large language models for sequential recommendation},
|
||||
author={Li, Xinhang and Chen, Chong and Zhao, Xiangyu and Zhang, Yong and Xing, Chunxiao},
|
||||
journal={arXiv preprint arXiv:2312.02443},
|
||||
year={2023}
|
||||
}
|
||||
@article{fu2025sliding,
|
||||
title={Sliding Window Attention Training for Efficient Large Language Models},
|
||||
author={Fu, Zichuan and Song, Wentao and Wang, Yejing and Wu, Xian and Zheng, Yefeng and Zhang, Yingying and Xu, Derong and Wei, Xuetao and Xu, Tong and Zhao, Xiangyu},
|
||||
journal={arXiv preprint arXiv:2502.18845},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
@article{wang2023multi,
|
||||
title={Multi-task deep recommender systems: A survey},
|
||||
author={Wang, Yuhao and Lam, Ha Tsz and Wong, Yi and Liu, Ziru and Zhao, Xiangyu and Wang, Yichao and Chen, Bo and Guo, Huifeng and Tang, Ruiming},
|
||||
journal={arXiv preprint arXiv:2302.03525},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{liu2023multi,
|
||||
title={Multi-task recommendations with reinforcement learning},
|
||||
author={Liu, Ziru and Tian, Jiejie and Cai, Qingpeng and Zhao, Xiangyu and Gao, Jingtong and Liu, Shuchang and Chen, Dayou and He, Tonghao and Zheng, Dong and Jiang, Peng and others},
|
||||
booktitle={Proceedings of the ACM web conference 2023},
|
||||
pages={1273--1282},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{liu2025multi,
|
||||
title={Multi-task Offline Reinforcement Learning for Online Advertising in Recommender Systems},
|
||||
author={Liu, Langming and Wang, Wanyu and Zhang, Chi and Li, Bo and Yin, Hongzhi and Wei, Xuetao and Su, Wenbo and Zheng, Bo and Zhao, Xiangyu},
|
||||
booktitle={Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 2},
|
||||
pages={4635--4646},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
|
||||
@inproceedings{he2015delving,
|
||||
title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},
|
||||
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||
booktitle={Proceedings of the IEEE international conference on computer vision},
|
||||
pages={1026--1034},
|
||||
year={2015}
|
||||
}
|
||||
@article{guo2025nlora,
|
||||
title={NLoRA: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
|
||||
author={Guo, Chenlu and Wu, Yuan and Chang, Yi},
|
||||
journal={arXiv preprint arXiv:2502.14482},
|
||||
year={2025}
|
||||
}
|
||||
@article{elfwing2018sigmoid,
|
||||
title={Sigmoid-weighted linear units for neural network function approximation in reinforcement learning},
|
||||
author={Elfwing, Stefan and Uchibe, Eiji and Doya, Kenji},
|
||||
journal={Neural networks},
|
||||
volume={107},
|
||||
pages={3--11},
|
||||
year={2018},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
@article{vaswani2017attention,
|
||||
title={Attention is all you need},
|
||||
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={30},
|
||||
year={2017}
|
||||
}
|
||||
@article{ba2016layer,
|
||||
title={Layer normalization},
|
||||
author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
|
||||
journal={arXiv preprint arXiv:1607.06450},
|
||||
year={2016}
|
||||
}
|
||||
@article{jin2025massive,
|
||||
title={Massive Values in Self-Attention Modules are the Key to Contextual Knowledge Understanding},
|
||||
author={Jin, Mingyu and Mei, Kai and Xu, Wujiang and Sun, Mingjie and Tang, Ruixiang and Du, Mengnan and Liu, Zirui and Zhang, Yongfeng},
|
||||
journal={arXiv preprint arXiv:2502.01563},
|
||||
year={2025}
|
||||
}
|
||||
@inproceedings{geva2021transformer,
|
||||
title={Transformer Feed-Forward Layers Are Key-Value Memories},
|
||||
author={Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer},
|
||||
booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
|
||||
pages={5484--5495},
|
||||
year={2021}
|
||||
}
|
||||
@article{team2023gemini,
|
||||
title={Gemini: a family of highly capable multimodal models},
|
||||
author={Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
|
||||
journal={arXiv preprint arXiv:2312.11805},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2023moelora,
|
||||
title={Moelora: An moe-based parameter efficient fine-tuning method for multi-task medical applications},
|
||||
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
|
||||
journal={arXiv preprint arXiv:2310.18339},
|
||||
year={2023}
|
||||
}
|
||||
@article{wang2023multilora,
|
||||
title={Multilora: Democratizing lora for better multi-task learning},
|
||||
author={Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
|
||||
journal={arXiv preprint arXiv:2311.11501},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2021p,
|
||||
title={P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
|
||||
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2110.07602},
|
||||
year={2021}
|
||||
}
|
||||
@article{brown2020language,
|
||||
title={Language models are few-shot learners},
|
||||
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={33},
|
||||
pages={1877--1901},
|
||||
year={2020}
|
||||
}
|
||||
@article{liu2021conflict,
|
||||
title={Conflict-averse gradient descent for multi-task learning},
|
||||
author={Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={18878--18890},
|
||||
year={2021}
|
||||
}
|
||||
@article{navon2022multi,
|
||||
title={Multi-task learning as a bargaining game},
|
||||
author={Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
|
||||
journal={arXiv preprint arXiv:2202.01017},
|
||||
year={2022}
|
||||
}
|
||||
|
||||
@inproceedings{wang2023wavelet,
|
||||
title={WHEN: A Wavelet-DTW hybrid attention network for heterogeneous time series analysis},
|
||||
author={Wang, Jingyuan and Yang, Chen and Jiang, Xiaohan and Wu, Junjie},
|
||||
booktitle={Proceedings of the 29th ACM SIGKDD conference on knowledge discovery and data mining},
|
||||
pages={2361--2373},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{sun2025stronger,
|
||||
title={A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
|
||||
author={Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2502.15828},
|
||||
year={2025}
|
||||
}
|
||||
@article{pfeiffer2020mad,
|
||||
title={Mad-x: An adapter-based framework for multi-task cross-lingual transfer},
|
||||
author={Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
|
||||
journal={arXiv preprint arXiv:2005.00052},
|
||||
year={2020}
|
||||
}
|
||||
@article{raffel2020exploring,
|
||||
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
|
||||
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
|
||||
journal={Journal of machine learning research},
|
||||
volume={21},
|
||||
number={140},
|
||||
pages={1--67},
|
||||
year={2020}
|
||||
}
|
||||
@article{zaken2021bitfit,
|
||||
title={Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
|
||||
author={Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
|
||||
journal={arXiv preprint arXiv:2106.10199},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{papineni2002bleu,
|
||||
title={Bleu: a method for automatic evaluation of machine translation},
|
||||
author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
|
||||
booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
|
||||
pages={311--318},
|
||||
year={2002}
|
||||
}
|
||||
@inproceedings{lin2004rouge,
|
||||
title={Rouge: A package for automatic evaluation of summaries},
|
||||
author={Lin, Chin-Yew},
|
||||
booktitle={Text summarization branches out},
|
||||
pages={74--81},
|
||||
year={2004}
|
||||
}
|
||||
@article{jang2016categorical,
|
||||
title={Categorical reparameterization with gumbel-softmax},
|
||||
author={Jang, Eric and Gu, Shixiang and Poole, Ben},
|
||||
journal={arXiv preprint arXiv:1611.01144},
|
||||
year={2016}
|
||||
}
|
||||
@article{yu2020gradient,
|
||||
title={Gradient surgery for multi-task learning},
|
||||
author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={33},
|
||||
pages={5824--5836},
|
||||
year={2020}
|
||||
}
|
||||
@article{renduchintala2023tied,
|
||||
title={Tied-lora: Enhancing parameter efficiency of lora with weight tying},
|
||||
author={Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
|
||||
journal={arXiv preprint arXiv:2311.09578},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{kwon2023efficient,
title={Efficient memory management for large language model serving with pagedattention},
author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
booktitle={Proceedings of the 29th Symposium on Operating Systems Principles},
pages={611--626},
year={2023}
}

@article{dai2024deepseekmoe,
title={Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models},
author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
journal={arXiv preprint arXiv:2401.06066},
year={2024}
}

@inproceedings{houlsby2019parameter,
title={Parameter-efficient transfer learning for NLP},
author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
booktitle={International conference on machine learning},
pages={2790--2799},
year={2019},
organization={PMLR}
}

@article{guo2025deepseek,
title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
journal={arXiv preprint arXiv:2501.12948},
year={2025}
}

@article{shazeer2017outrageously,
title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
journal={arXiv preprint arXiv:1701.06538},
year={2017}
}

@inproceedings{rajbhandari2022deepspeed,
title={Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale},
author={Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
booktitle={International conference on machine learning},
pages={18332--18346},
year={2022},
organization={PMLR}
}
@article{zhang2023instruction,
title={Instruction tuning for large language models: A survey},
author={Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
journal={arXiv preprint arXiv:2308.10792},
year={2023}
}

@article{han2024parameter,
title={Parameter-efficient fine-tuning for large models: A comprehensive survey},
author={Han, Zeyu and Gao, Chao and Liu, Jinyang and Zhang, Jeff and Zhang, Sai Qian},
journal={arXiv preprint arXiv:2403.14608},
year={2024}
}

@article{pfeiffer2020adapterfusion,
title={Adapterfusion: Non-destructive task composition for transfer learning},
author={Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
journal={arXiv preprint arXiv:2005.00247},
year={2020}
}

@article{pfeiffer2020adapterhub,
title={Adapterhub: A framework for adapting transformers},
author={Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
journal={arXiv preprint arXiv:2007.07779},
year={2020}
}

@article{lialin2023scaling,
title={Scaling down to scale up: A guide to parameter-efficient fine-tuning},
author={Lialin, Vladislav and Deshpande, Vijeta and Rumshisky, Anna},
journal={arXiv preprint arXiv:2303.15647},
year={2023}
}

@article{li2021prefix,
title={Prefix-tuning: Optimizing continuous prompts for generation},
author={Li, Xiang Lisa and Liang, Percy},
journal={arXiv preprint arXiv:2101.00190},
year={2021}
}

@article{lu2023uniadapter,
title={Uniadapter: Unified parameter-efficient transfer learning for cross-modal modeling},
author={Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
journal={arXiv preprint arXiv:2302.06605},
year={2023}
}
@article{fedus2022switch,
title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
journal={Journal of Machine Learning Research},
volume={23},
number={120},
pages={1--39},
year={2022}
}

@article{lepikhin2020gshard,
title={Gshard: Scaling giant models with conditional computation and automatic sharding},
author={Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
journal={arXiv preprint arXiv:2006.16668},
year={2020}
}

@inproceedings{dou2024loramoe,
title={LoRAMoE: Alleviating world knowledge forgetting in large language models via MoE-style plugin},
author={Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={1932--1945},
year={2024}
}

@article{zhang2023adalora,
title={AdaLoRA: Adaptive budget allocation for parameter-efficient fine-tuning},
author={Zhang, Qingru and Chen, Minshuo and Bukharin, Alexander and Karampatziakis, Nikos and He, Pengcheng and Cheng, Yu and Chen, Weizhu and Zhao, Tuo},
journal={arXiv preprint arXiv:2303.10512},
year={2023}
}

@article{liu2024dora,
title={Dora: Weight-decomposed low-rank adaptation},
author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
journal={arXiv preprint arXiv:2402.09353},
year={2024}
}

@article{hu2021lora,
title={Lora: Low-rank adaptation of large language models},
author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
journal={arXiv preprint arXiv:2106.09685},
year={2021}
}
@article{achiam2023gpt,
title={Gpt-4 technical report},
author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
journal={arXiv preprint arXiv:2303.08774},
year={2023}
}

@article{jaszczur2021sparse,
title={Sparse is enough in scaling transformers},
author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={9895--9907},
year={2021}
}

@inproceedings{standley2020tasks,
title={Which tasks should be learned together in multi-task learning?},
author={Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
booktitle={International conference on machine learning},
pages={9120--9132},
year={2020},
organization={PMLR}
}

@article{cai2024survey,
title={A survey on mixture of experts},
author={Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
journal={arXiv preprint arXiv:2407.06204},
year={2024}
}

@article{karimi2021compacter,
title={Compacter: Efficient low-rank hypercomplex adapter layers},
author={Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={1022--1035},
year={2021}
}

@article{bommasani2021opportunities,
title={On the opportunities and risks of foundation models},
author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
journal={arXiv preprint arXiv:2108.07258},
year={2021}
}
@article{pan2024lisa,
title={LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
author={Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
journal={arXiv preprint arXiv:2403.17919},
year={2024}
}

@article{feng2024mixture,
title={Mixture-of-loras: An efficient multitask tuning for large language models},
author={Feng, Wenfeng and Hao, Chuzhan and Zhang, Yuewei and Han, Yu and Wang, Hao},
journal={arXiv preprint arXiv:2403.03432},
year={2024}
}

@article{lester2021power,
title={The power of scale for parameter-efficient prompt tuning},
author={Lester, Brian and Al-Rfou, Rami and Constant, Noah},
journal={arXiv preprint arXiv:2104.08691},
year={2021}
}

@article{zhou2024lima,
title={Lima: Less is more for alignment},
author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}

@article{wei2021finetuned,
title={Finetuned language models are zero-shot learners},
author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
journal={arXiv preprint arXiv:2109.01652},
year={2021}
}

@article{brynjolfsson2025generative,
title={Generative AI at work},
author={Brynjolfsson, Erik and Li, Danielle and Raymond, Lindsey},
journal={The Quarterly Journal of Economics},
pages={qjae044},
year={2025},
publisher={Oxford University Press}
}
@Misc{peft,
title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
howpublished = {\url{https://github.com/huggingface/peft}},
year = {2022}
}

@article{li2023chatdoctor,
title={ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge},
author={Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
journal={Cureus},
volume={15},
number={6},
year={2023},
publisher={Cureus}
}

@online{DatabricksBlog2023DollyV2,
author = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},
title = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},
year = {2023},
url = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
urldate = {2023-06-30}
}

@article{nakano2021webgpt,
author = {Reiichiro Nakano and Jacob Hilton and Suchir Balaji and Jeff Wu and Long Ouyang and Christina Kim and Christopher Hesse and Shantanu Jain and Vineet Kosaraju and William Saunders and Xu Jiang and Karl Cobbe and Tyna Eloundou and Gretchen Krueger and Kevin Button and Matthew Knight and Benjamin Chess and John Schulman},
title = {WebGPT: Browser-assisted question-answering with human feedback},
journal = {arXiv preprint arXiv:2112.09332},
year = {2021}
}

@inproceedings{zhang2023automatic,
title={Automatic Chain of Thought Prompting in Large Language Models},
author={Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
booktitle={The Eleventh International Conference on Learning Representations (ICLR 2023)},
year={2023}
}

@misc{codealpaca,
author = {Sahil Chaudhary},
title = {Code Alpaca: An Instruction-following LLaMA model for code generation},
year = {2023},
publisher = {GitHub},
howpublished = {\url{https://github.com/sahil280114/codealpaca}}
}

@article{zhao2024hypermoe,
title={HyperMoE: Towards Better Mixture of Experts via Transferring Among Experts},
author={Zhao, Hao and Qiu, Zihan and Wu, Huijia and Wang, Zili and He, Zhaofeng and Fu, Jie},
journal={arXiv preprint arXiv:2402.12656},
year={2024}
}
587
mypaper/CIKM2025_HyCAM.tex
Executable file
@@ -0,0 +1,587 @@
\title{Contextual Attention Modulation: Towards Efficient Multi-Task Adaptation in Large Language Models}

\begin{abstract}
Large Language Models (LLMs) possess remarkable generalization capabilities but struggle with multi-task adaptation, particularly in balancing knowledge retention with task-specific specialization.
Conventional fine-tuning methods suffer from catastrophic forgetting and substantial resource consumption, while existing parameter-efficient methods perform suboptimally in complex multi-task scenarios.
To address this, we propose Contextual Attention Modulation (CAM), a novel mechanism that dynamically modulates the representations of self-attention modules in LLMs. CAM enhances task-specific features while preserving general knowledge, thereby facilitating more effective and efficient adaptation.
For effective multi-task adaptation, CAM is integrated into our Hybrid Contextual Attention Modulation (HyCAM) framework, which combines a shared, full-parameter CAM module with multiple specialized, lightweight CAM modules, enhanced by a dynamic routing strategy for adaptive knowledge fusion.
Extensive experiments on heterogeneous tasks, including question answering, code generation, and logical reasoning, demonstrate that HyCAM significantly outperforms existing methods, achieving an average performance improvement of 3.65\%. The implementation code and data are publicly available for reproducibility.\footnote{https://github.com/Applied-Machine-Learning-Lab/HyCAM}
\end{abstract}

\input{0_misc}
\section{Introduction} \label{sec:intro}
Large Language Models (LLMs) have demonstrated remarkable capabilities, owing to their extensive general knowledge and powerful reasoning abilities~\cite{achiam2023gpt, team2023gemini}.
Beyond conversational use, these models are increasingly proving invaluable as core components in advanced information retrieval~\cite{li2023e4srec, li2023web}, critical decision-making systems~\cite{brynjolfsson2025generative, wang2023rethinking}, and spatiotemporal applications~\cite{cheng2025poi, zhang2024veccity}.
This success has led to increasing demand for adapting such models to specialized domains and, more importantly, for handling multiple diverse tasks simultaneously.
This capability is essential for effective deployment in real-world applications~\cite{bommasani2021opportunities, yu2025bigcity, ji2025seeing}.

Supervised Fine-Tuning (SFT), a widely adopted adaptation approach, involves further tuning a pre-trained model on task-specific instruction data~\cite{wei2021finetuned}.
However, achieving effective adaptation still poses significant challenges.
Conventional full-parameter fine-tuning, a common SFT implementation that updates all parameters, must achieve effective adaptation while preserving foundational capabilities.
Training on a narrow task-specific dataset can significantly alter the model's pre-trained weights, leading to catastrophic forgetting~\cite{lester2021power}.
Furthermore, such an approach typically demands substantial computational resources.
These limitations hinder its applicability in many practical scenarios, especially in multi-task settings~\cite{wang2023multi, fu2025training}, where models must balance generalization and specialization.
To address these limitations, various Parameter-Efficient Fine-Tuning (PEFT) techniques have been proposed. These approaches adapt pre-trained LLMs to new tasks by updating only a small number of trainable parameters while leaving the backbone model unchanged, thereby reducing computational cost and overfitting risks~\cite{han2024parameter}.
Common PEFT strategies include adapter-based methods~\cite{houlsby2019parameter} that insert lightweight trainable modules, prompt-based methods such as Prefix Tuning~\cite{li2021prefix} that modify input representations, and reparameterization methods such as Low-Rank Adaptation (LoRA)~\cite{hu2021lora} and its variants. LoRA, a widely used PEFT method, applies a low-rank decomposition to weight updates, making it both efficient and effective.

However, these methods face limitations in complex multi-task scenarios due to their limited generalization and representational capacity across diverse tasks, as well as potential interference when adapting to multiple objectives simultaneously~\cite{yu2020gradient, liu2021conflict, navon2022multi}.
Specifically for low-rank reparameterization approaches such as LoRA, the low-rank constraint may restrict model expressiveness when applied to highly complex tasks, resulting in suboptimal performance~\cite{pan2024lisa}.
Strategies that incorporate the Mixture-of-Experts (MoE) mechanism, combining multiple specialized PEFT modules for multi-task adaptation, aim to enhance model capacity for diverse tasks; however, these MoE-based approaches can introduce additional challenges, including mitigating coupling effects and effectively managing the contributions of different experts~\cite{rajbhandari2022deepspeed}.


Overall, adapting LLMs to diverse tasks presents two major challenges: (1) preserving rich pre-trained general knowledge while specializing for specific tasks, and (2) extending the multi-task capabilities of parameter-efficient methods.

Our approach is motivated by a key observation regarding LLM architectures: different components of the Transformer play different roles and exhibit different activation behaviors.
Existing literature suggests that Feed-Forward Network (FFN) layers, constituting the bulk of model parameters, primarily function as key repositories for storing and recalling general knowledge~\cite{geva2021transformer}.
In contrast, self-attention mechanisms are primarily responsible for processing and integrating contextual information within the input sequence, capturing dependencies between tokens~\cite{jin2025massive}.
This functional difference is also reflected in parameter activation.
While FFNs, comprising approximately 90\% of model parameters, exhibit high activation sparsity, self-attention mechanisms typically demonstrate denser activation patterns~\cite{cai2024survey, fedus2022switch, jaszczur2021sparse}.
This denser engagement highlights their critical role in integrating latent general knowledge with contextual information derived from the input.

Given these differences, we argue that focusing on the modulation of self-attention during multi-task adaptation provides a more effective and specialized strategy.
The key insight is that large-scale pre-training has equipped LLMs with extensive general knowledge, so effective adaptation should focus on enabling LLMs to better integrate task-specific contextual information.
Such an approach can refine how general knowledge is integrated with the specific contextual demands of diverse tasks. Importantly, this modulation preserves pre-trained general knowledge, thereby mitigating issues such as catastrophic forgetting and task interference.

To this end, we introduce Contextual Attention Modulation (CAM), a novel mechanism designed to dynamically modulate the representations within the self-attention modules of LLMs based on the input context.
CAM learns to dynamically modulate self-attention representations to adapt to the input context.
This context-aware mechanism selectively amplifies task-relevant attentional signals and suppresses irrelevant or interfering ones, thereby enhancing task-specific features while preserving the model's pre-trained general knowledge.
Directly modulating the organization of contextual information within attention modules promotes more effective knowledge retention and specialized adaptation, thereby supporting more robust and efficient multi-task learning.

To extend these multi-task capabilities, we embed CAM into our Hybrid Contextual Attention Modulation (HyCAM) framework.
HyCAM combines a shared, full-parameter CAM module, which is designed to capture and leverage common knowledge across all tasks, with multiple specialized, lightweight CAM modules.
These specialized modules implement the CAM mechanism using PEFT techniques to efficiently capture distinct task features, allowing effective multi-task adaptation with minimal additional trainable parameters.
A soft-routing strategy, further augmented by a load-balancing constraint, dynamically manages the fusion of knowledge from these shared and specialized CAM components.
This design empowers HyCAM to extend multi-task performance by enabling both efficient knowledge sharing and fine-grained specialization.

The main contributions of this paper are summarized as follows:
\begin{itemize}[leftmargin=*, topsep=0pt]
\item We propose Contextual Attention Modulation (CAM), a novel mechanism that learns to dynamically modulate self-attention representations in LLMs based on the input context.
CAM is designed to enhance task-specific features while preserving pre-trained general knowledge, thereby facilitating more effective knowledge retention and specialized adaptation.
\item We introduce the Hybrid Contextual Attention Modulation (HyCAM) framework, which extends multi-task adaptation capabilities by integrating our CAM mechanism in distinct forms. This integration empowers HyCAM to achieve superior multi-task performance by effectively balancing efficient knowledge sharing with fine-grained task specialization.
\item We conduct extensive experiments across a range of tasks covering question answering, code generation, logical reasoning, and other domains. Comparative experiments demonstrate that HyCAM significantly outperforms existing state-of-the-art approaches with faster convergence.
\end{itemize}
\section{Preliminaries}
This section briefly reviews the fundamental concepts essential for understanding our proposed method. We discuss the relevant components of the Transformer architecture, the basics of task-adaptive fine-tuning, and common PEFT techniques.

\subsection{Transformer Architecture}
The Transformer architecture~\cite{vaswani2017attention} serves as the backbone of most LLMs owing to its ability to efficiently process sequences of data through attention mechanisms, making it especially powerful for understanding and generating human language.
A Transformer model is typically composed of a stack of identical blocks.
Each block primarily contains two core components: the self-attention mechanism and the Feed-Forward Network (FFN).
The self-attention mechanism allows the model to weigh the importance of different tokens in an input sequence and capture contextual relationships by computing attention scores using Query ($Q$), Key ($K$), and Value ($V$) projections, typically via scaled dot-product attention: $\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$. Following this, the FFN, typically composed of two linear transformations with a non-linear activation, further processes each token's representation independently and encodes much of the model's stored knowledge.
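
For concreteness, single-head scaled dot-product attention can be sketched in a few lines of PyTorch (a minimal illustration only; tensor names are our own):
\begin{verbatim}
import math
import torch

def scaled_dot_product_attention(Q, K, V):
    # Q, K, V: (batch, L, d_k)
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)  # (batch, L, L)
    weights = torch.softmax(scores, dim=-1)            # row-wise attention weights
    return weights @ V                                 # (batch, L, d_k)
\end{verbatim}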


\subsection{Task-Adaptive Fine-Tuning} \label{sec:finetune}

While LLMs acquire extensive general knowledge and reasoning capabilities during pre-training, they typically require further adaptation to specialize them for specific tasks and to align their behavior with desired objectives, such as following instructions.
A common approach for such task-adaptive fine-tuning is Supervised Fine-Tuning (SFT). In SFT, the model learns from examples that provide explicit input-output pairings.
These pairings might illustrate a question with its corresponding answer or an instruction followed by the desired model output.
The primary goal is to adjust the model's parameters to minimize a task-specific loss function, such as the cross-entropy loss for sequence generation or classification tasks.


\subsection{Parameter-Efficient Fine-Tuning}
Adapting LLMs to specific tasks often involves fine-tuning, but updating all parameters is computationally expensive. PEFT methods enable model adaptation by introducing a small set of new parameters or reparameterizing existing ones while keeping the backbone model weights frozen, significantly reducing computational costs.

A mainstream PEFT category is reparameterization, which introduces a smaller set of trainable parameters that efficiently influence the model's behavior.
For instance, a common strategy is to represent the change in a pre-trained weight matrix $W_0$ during adaptation as a low-rank update, based on the observation that task-specific changes often lie in a subspace of much lower dimensionality than the full parameter space.
Thus, instead of learning a large, dense update matrix $\Delta W$, these methods learn a low-rank approximation of it, such as $\Delta W = BA$, where $B$ and $A$ are much smaller matrices~\cite{hu2021lora}.
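
As a minimal illustration of such a low-rank reparameterization (a sketch under the $\Delta W = BA$ formulation above, not the implementation of any particular library; the class and argument names are our own):
\begin{verbatim}
import torch
import torch.nn as nn

class LowRankUpdate(nn.Module):
    # y = base(x) + x @ (B A)^T, with the pre-trained weight W_0 frozen
    def __init__(self, base: nn.Linear, r: int = 8):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False             # keep W_0 frozen
        d_out, d_in = base.weight.shape
        self.A = nn.Parameter(torch.randn(r, d_in) * 0.01)  # down-projection
        self.B = nn.Parameter(torch.zeros(d_out, r))        # zero-init: Delta W = 0 at start

    def forward(self, x):
        return self.base(x) + x @ self.A.T @ self.B.T
\end{verbatim}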

\begin{figure*}[ht]
\centering
\includegraphics[width=0.75\linewidth]{assets/model_v4.pdf}
\caption{The architecture of the CAM and HyCAM framework. HyCAM applies a hybrid CAM mechanism to the output of the Attention module within each Transformer block, while the backbone LLM remains frozen. Specifically, HyCAM integrates a shared, full-parameter CAM module and multiple lightweight Specialized CAMs for common and task-specific knowledge.}
\label{fig:model}
\end{figure*}

\section{Method}
We first present an overview of our proposed HyCAM framework.
Next, the core CAM mechanism is detailed. We then provide an in-depth description of the HyCAM framework, including its hybrid components and its dynamic knowledge fusion strategy with soft routing and a load-balancing constraint, and conclude by specifying the training objective.
\subsection{Framework Overview}
To address the critical challenge of enabling LLMs to efficiently adapt to diverse tasks while balancing knowledge retention with task-specific specialization, we introduce the Hybrid Contextual Attention Modulation (HyCAM) framework.
The core mechanism of HyCAM is Contextual Attention Modulation (CAM), which dynamically learns context-dependent modulation of self-attention representations, selectively amplifying task-relevant signals while suppressing irrelevant or potentially interfering ones to enhance task-specific features and preserve general knowledge.
As illustrated in Figure~\ref{fig:model}, the HyCAM framework employs a novel hybrid architecture that integrates a shared, full-parameter CAM module, designed to capture common knowledge across tasks, with multiple specialized CAM modules that utilize parameter-efficient techniques for efficient, fine-grained adaptation to distinct task features.
The contributions of these diverse CAM modules are managed by a dynamic routing strategy to ensure balanced utilization of the specialized components and adaptive knowledge fusion.

\subsection{Contextual Attention Modulation} \label{sec:cam}
The CAM mechanism is the core of our HyCAM framework, designed to dynamically modulate self-attention representations at each Transformer block.
It learns to dynamically amplify task-relevant attentional signals and suppress irrelevant ones based on the input context, thereby enhancing task-specific features while preserving the model's pre-trained general knowledge, which facilitates more effective and efficient task adaptation.

\subsubsection{\textbf{Motivation}}
Our motivation for developing CAM comes from the analysis of the distinct roles and activation patterns of different Transformer components, as described in Section~\ref{sec:intro}.
While FFN modules account for a large portion of parameters and store a vast amount of an LLM's parameterized knowledge, self-attention modules are crucial for dynamically processing and integrating contextual information.
The differing activation patterns of these components highlight the important role of the self-attention modules in integrating latent general knowledge with the specific context derived from an input.
Since LLMs already possess extensive general knowledge from large-scale pre-training, the key to effective adaptation lies in enabling them to better integrate this foundational knowledge with task-specific contextual information.
Conventional fine-tuning approaches, however, can often overwrite valuable pre-trained representations when introducing new task-specific knowledge.

This observation motivated us to develop CAM, a mechanism that refines how general knowledge is integrated with the specific contextual demands of diverse tasks by modulating self-attention representations. This approach aims to facilitate task-adaptive specialization while preserving valuable pre-trained knowledge.

\subsubsection{\textbf{The CAM Mechanism}} \label{subsec:camdetail}
The CAM mechanism is integrated into each Transformer block, operating on the output of the self-attention module to dynamically modulate its representations based on the input context. This process allows for a fine-grained modulation of the contextual information flow.
Specifically, the CAM mechanism proceeds as follows:

\paratitle{Input Normalization: }
Let $h_{in} \in \mathbb{R}^{L \times d}$ be the input hidden state to a Transformer layer, where $L$ denotes the sequence length and $d$ represents the hidden dimension.
Consistent with standard Transformer operations, these input hidden states are first normalized using Layer Normalization~\cite{ba2016layer}, producing $h_{norm} \in \mathbb{R}^{L \times d}$:
\begin{equation}
h_{norm} = \text{LayerNorm}(h_{in}).
\end{equation}
The resulting $h_{norm}$ serves as the input for both the conventional self-attention computation and our CAM module.


\paratitle{Modulation Weight Generation: }
CAM then computes a context-dependent modulation weight tensor, denoted as $\mathbf{A}_{\text{CAM}} \in \mathbb{R}^{L \times d}$.
These weights are derived from the normalized hidden state $h_{norm}$ through a linear projection parameterized by a trainable weight matrix $W_{proj} \in \mathbb{R}^{d \times d}$, followed by a SiLU activation function~\cite{elfwing2018sigmoid}:
\begin{equation}
\mathbf{A}_{\text{CAM}} = \text{SiLU}(h_{norm} W_{proj}).
\end{equation}
The matrix $W_{proj}$ is specific to the CAM module and is crucial for learning how to modulate the attention representations based on the input context.
To ensure stability during the initial phase of fine-tuning and to allow the model to gradually learn the modulation, $W_{proj}$ is initialized as a zero matrix.
This initialization ensures that at the beginning of fine-tuning, CAM does not alter the pre-trained model's behavior. That is, the model initially maintains its original approach to processing contextual information, which is then gradually modulated as training progresses for a stable adaptation.

\paratitle{Application of Modulation: }
Concurrently, the standard attention output $h_{att} \in \mathbb{R}^{L \times d}$ is computed using the normalized input $h_{norm}$:
\begin{equation}
h_{att} = \text{Self-Attention}(h_{norm}).
\label{eq:oriattn}
\end{equation}
The CAM mechanism then refines $h_{att}$ by applying the learned modulation weights $\mathbf{A}_{\text{CAM}}$. This is performed via an element-wise Hadamard product ($\odot$).
The modulated signal is integrated with the original $h_{att}$ through a residual connection, forming the final output $h_{out} \in \mathbb{R}^{L \times d}$ of the attention mechanism incorporating CAM:
\begin{equation}
h_{out} = h_{att} + h_{att} \odot \mathbf{A}_{\text{CAM}}.
\end{equation}
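
To make the computation concrete, the full CAM step can be sketched as a small PyTorch module (a simplified illustration of the equations above; module and variable names are ours and not part of a released implementation):
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class CAM(nn.Module):
    # Contextual Attention Modulation applied to the self-attention output.
    def __init__(self, d_model: int):
        super().__init__()
        # Zero-initialized so that A_CAM = SiLU(0) = 0 and the pre-trained
        # behavior is left unchanged at the start of fine-tuning.
        self.W_proj = nn.Parameter(torch.zeros(d_model, d_model))

    def forward(self, h_norm, h_att):
        a_cam = F.silu(h_norm @ self.W_proj)  # (L, d) modulation weights
        return h_att + h_att * a_cam          # Hadamard product + residual
\end{verbatim}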

\subsubsection{\textbf{Advantages}}
By dynamically generating and applying these modulation weights, CAM refines the contextual representations from the self-attention modules to adapt them to specific tasks while preserving pre-trained general knowledge, thereby mitigating catastrophic forgetting.
Thus, CAM facilitates an effective balance between achieving task-specific adaptation and retaining extensive general knowledge.
Moreover, by modulating attention outputs instead of fine-tuning a large number of backbone parameters, CAM achieves computational efficiency.

\vspace{-5px}
\subsection{The HyCAM Framework}
While the CAM mechanism provides a powerful tool for modulating attention representations,
adapting LLMs to handle multiple diverse tasks simultaneously presents significant challenges.
Conventional full fine-tuning struggles with catastrophic forgetting and resource demands, while existing PEFT methods still face limitations in multi-task settings.
Specifically, their limited representational capacity makes them suboptimal for highly complex tasks, and naive applications of expert-based strategies can lead to imbalanced expert utilization.

To address these challenges and effectively leverage the CAM mechanism for complex multi-task learning scenarios, we introduce the HyCAM framework.
The framework is designed to extend multi-task adaptation capabilities by integrating CAM in hybrid forms, enabling both efficient knowledge sharing and fine-grained task specialization.
This is achieved by strategically combining a shared, full-parameter CAM module, for efficient knowledge sharing, with multiple specialized, parameter-efficient CAM modules, for fine-grained specialization.
The contributions of these components are coordinated by a dynamic routing mechanism with a load-balancing constraint to ensure adaptive knowledge fusion.

\subsubsection{\textbf{Hybrid CAM Components}}
The hybrid architecture of the HyCAM framework is designed to leverage both general context understanding and specialized, task-specific adaptation capabilities. This architecture comprises a shared, full-parameter CAM module and multiple lightweight, specialized CAM modules:

\paratitle{Shared CAM Module: }
The Shared CAM module serves as a global modulator that captures and refines common contextual patterns and general knowledge across all tasks. This module is a full-parameter CAM, as detailed in Section~\ref{sec:cam}. Its trainable projection matrix, denoted as $W_{Shared} \in \mathbb{R}^{d \times d}$, is shared and updated across all tasks to produce a modulation weight tensor:
\begin{equation}
\mathbf{A}_{Shared} = \text{SiLU}(h_{norm}W_{Shared}).
\end{equation}

\paratitle{Specialized CAM Modules: }
In addition to the shared module, HyCAM incorporates multiple ($N_s$) lightweight Specialized CAM modules.
Specialized CAM modules are designed to learn and apply attention modulations tailored to the distinct features of specific tasks.

Different tasks often require different ways of handling contextual information in the self-attention layer. For example, code generation may need to focus on long-range dependencies, while question answering may prioritize specific entities and their relationships in a local context. This design enables the model to develop fine-grained adaptations for diverse tasks, thereby mitigating the interference that arises when a single component attempts to learn potentially conflicting objectives from multiple tasks.

The implementation of Specialized CAM modules leverages PEFT techniques to reduce the number of trainable parameters per specialized module, making the framework scalable.
This also helps mitigate overfitting, especially when task-specific data is limited.
Specifically, each Specialized CAM module, indexed by $k \in \{1, ..., N_s\}$, generates its unique modulation weight tensor $\mathbf{A}_{\text{Spec}_k} \in \mathbb{R}^{L \times d}$ as follows:
\begin{equation}
\mathbf{A}_{\text{Spec}_k} = \text{SiLU}(h_{norm} W_{\text{Spec}_k}),
\end{equation}
where $W_{\text{Spec}_k}$ is the trainable projection matrix specific to the $k$-th specialized module. To achieve parameter efficiency while enhancing representational capacity, we adopt the SLoRA~\cite{guo2025nlora} technique for the structure of $W_{\text{Spec}_k}$. Instead of a direct low-rank decomposition like LoRA, typically $W = BA$, SLoRA introduces an intermediate trainable matrix $N$ between $B$ and $A$. Thus, $W_{\text{Spec}_k}$ is parameterized as:
\begin{equation}
W_{\text{Spec}_k} = B_k N_k A_k.
\end{equation}
Here, $A_k \in \mathbb{R}^{r \times d}$ projects the $d$-dimensional hidden state $h_{norm}$ into a lower-dimensional space of rank $r$, $N_k \in \mathbb{R}^{r \times r}$ is a trainable intermediate matrix within the low-rank space,
and $B_k \in \mathbb{R}^{d \times r}$ projects the $r$-dimensional representation back to the original $d$-dimensional space.
The rank $r$ is significantly smaller than $d$, ensuring a substantial reduction in trainable parameters compared to a full $d \times d$ matrix.

For initialization, and similar to the zero initialization of $W_{Shared}$ in the Shared CAM module, we adopt a strategy that ensures training stability. Specifically, the matrices $A_k$ and $N_k$ are initialized using Kaiming Uniform~\cite{he2015delving}, while $B_k$ is initialized with zeros. This structure allows each Specialized CAM to develop task-specific modulations with very few additional parameters, thus enhancing the adaptability of the model without sacrificing efficiency.
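
A lightweight Specialized CAM module can then be sketched as follows (illustrative only; it mirrors the $B_k N_k A_k$ factorization and initialization described above, with names of our choosing):
\begin{verbatim}
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpecializedCAM(nn.Module):
    # Low-rank CAM: the d x d projection is factorized as B N A with rank r << d.
    def __init__(self, d_model: int, r: int = 16):
        super().__init__()
        self.A = nn.Parameter(torch.empty(r, d_model))   # d -> r projection
        self.N = nn.Parameter(torch.empty(r, r))         # trainable r x r core
        self.B = nn.Parameter(torch.zeros(d_model, r))   # r -> d, zero-initialized
        nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.N, a=math.sqrt(5))

    def forward(self, h_norm):
        low = h_norm @ self.A.T        # (L, r)
        low = low @ self.N.T           # (L, r)
        return F.silu(low @ self.B.T)  # (L, d) specialized modulation weights
\end{verbatim}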

\subsubsection{\textbf{Dynamic Routing}} \label{sec:routing}
To effectively leverage the diverse contributions from the Shared CAM and multiple Specialized CAM modules, HyCAM incorporates a dynamic soft-routing mechanism coupled with a load-balancing constraint.
This mechanism adaptively determines the influence of each module based on the input context and promotes load balancing to ensure efficient utilization of all Specialized CAMs.


\paratitle{Routing for Specialized CAMs: }
The dynamic routing mechanism weights the contributions of the $N_s$ Specialized CAM modules for each input token. This enables HyCAM to adapt its modulation strategy in a fine-grained, context-dependent manner. The routing process is detailed as follows:

For each token representation $h_{norm}$, derived from $h_{in}$ as described in Section~\ref{subsec:camdetail}, a lightweight router network first generates $\mathbf{logits} \in \mathbb{R}^{N_s}$, produced by a linear layer applied to $h_{norm}$:
\begin{equation}
\mathbf{logits} = h_{norm} W_{router},
\end{equation}
where $W_{router} \in \mathbb{R}^{d \times N_s}$ is the trainable weight matrix of the router.

These $\mathbf{logits} = [\pi_1, \pi_2, ..., \pi_{N_s}]$ are then transformed into a probability distribution over the specialized modules using the Gumbel-Softmax estimator~\cite{jang2016categorical} to obtain differentiable, soft routing probabilities.
The Gumbel-Softmax allows for differentiable sampling from a categorical distribution, which facilitates the training process while encouraging exploration:
\begin{equation}
p_k = \frac{\exp((\log \pi_k + g_k)/\tau)}{\sum_{j=1}^{N_s} \exp((\log \pi_j + g_j)/\tau)},
\label{eq:gumbel_softmax}
\end{equation}
where $p_k$ is the resulting soft routing weight for the $k$-th Specialized CAM module. $g_k \sim \text{Gumbel}(0,1)$ are \iid noise samples drawn from the Gumbel distribution, adding stochasticity for exploration. $\tau$ is a temperature hyperparameter that controls the sharpness of the probability distribution: lower temperatures make the selection more discrete, while higher temperatures make it softer.
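
This routing step can be sketched with PyTorch's built-in Gumbel-Softmax (a minimal illustration; the router structure and temperature value here are assumptions):
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpecializedRouter(nn.Module):
    def __init__(self, d_model: int, n_specialized: int, tau: float = 1.0):
        super().__init__()
        self.proj = nn.Linear(d_model, n_specialized, bias=False)  # W_router
        self.tau = tau

    def forward(self, h_norm):
        logits = self.proj(h_norm)                              # (L, N_s)
        # differentiable soft routing weights with Gumbel noise for exploration
        p = F.gumbel_softmax(logits, tau=self.tau, hard=False)  # (L, N_s)
        return logits, p
\end{verbatim}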

\paratitle{Load Balancing Loss: }
To prevent the router from over-selecting a few modules, HyCAM adds a load-balancing loss $\mathcal{L}_{balance}$ that encourages more balanced routing across the specialized components. For a batch of $B$ tokens, it is computed as:

\begin{equation}
\mathcal{L}_{balance} = \sum_{k=1}^{N_s} \left( \frac{1}{B} \sum_{b=1}^{B} p_{b,k} \right) \cdot \left( \frac{1}{B} \sum_{b=1}^{B} \text{softmax}(\mathbf{logits}_{b})_k \right),
\label{eq:load_balance_loss}
\end{equation}
where $p_{b,k}$ is the Gumbel-Softmax routing weight and $\text{softmax}(\mathbf{logits}_{b})_k$ is the standard softmax output of the router logits for token $b$.
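
A direct transcription of Equation~\ref{eq:load_balance_loss} reads as follows (a sketch; variable names are ours):
\begin{verbatim}
import torch
import torch.nn.functional as F

def load_balance_loss(p, logits):
    # p:      (B, N_s) Gumbel-Softmax routing weights per token
    # logits: (B, N_s) raw router logits per token
    mean_route = p.mean(dim=0)                         # average routing weight per module
    mean_prob = F.softmax(logits, dim=-1).mean(dim=0)  # average router probability per module
    return (mean_route * mean_prob).sum()
\end{verbatim}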

\paratitle{Fusion of Modulations: }
Once the routing weights $p_k$ are determined for each token, as described in Equation~\ref{eq:gumbel_softmax}, the final context-dependent modulation tensor, $\mathbf{A}_{Fusion} \in \mathbb{R}^{L \times d}$, is computed by combining the output of the Shared CAM module, $\mathbf{A}_{Shared}$, with the dynamically weighted sum of the modulations from all Specialized CAM modules, $\{\mathbf{A}_{Spec_k}\}_{k=1}^{N_s}$:
\begin{equation}
\mathbf{A}_{Fusion} = \mathbf{A}_{Shared} + \sum_{k=1}^{N_s} p_k \cdot \mathbf{A}_{Spec_k}.
\label{eq:fusion_modulation}
\end{equation}
Here, $p_k$ denotes the token-specific routing weight of the $k$-th specialized module, ensuring that the context-based modulation $\mathbf{A}_{Fusion}$ integrates both general and adaptively selected specialized knowledge.
Finally, it is applied to the original self-attention output $h_{att}$, from Equation~\ref{eq:oriattn} in Section~\ref{subsec:camdetail}, to produce the HyCAM-enhanced output $h_{out}$ using the element-wise Hadamard product and residual connection, as defined in the core CAM mechanism:
\begin{equation}
h_{out} = h_{att} + h_{att} \odot \mathbf{A}_{Fusion}.
\end{equation}
This entire mechanism, from dynamic routing to the application of the fused modulation, allows HyCAM to dynamically modulate the self-attention process by integrating shared knowledge with specialized insights, thereby enabling the model to effectively balance generalization across diverse tasks with task-specific adaptation.
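
Putting the pieces together, one HyCAM-augmented attention output can be sketched as below, reusing the CAM, SpecializedCAM, and SpecializedRouter sketches given earlier (all module names are ours; this is an illustration, not the released implementation):
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class HyCAMLayer(nn.Module):
    def __init__(self, d_model: int, n_specialized: int = 4, r: int = 16):
        super().__init__()
        self.shared = CAM(d_model)                        # full-parameter shared CAM
        self.specialized = nn.ModuleList(
            SpecializedCAM(d_model, r) for _ in range(n_specialized))
        self.router = SpecializedRouter(d_model, n_specialized)

    def forward(self, h_norm, h_att):
        a_shared = F.silu(h_norm @ self.shared.W_proj)    # A_Shared, (L, d)
        logits, p = self.router(h_norm)                   # (L, N_s)
        a_spec = torch.stack([m(h_norm) for m in self.specialized], dim=-1)  # (L, d, N_s)
        a_fusion = a_shared + (a_spec * p.unsqueeze(1)).sum(dim=-1)  # fused modulation
        h_out = h_att + h_att * a_fusion                  # Hadamard product + residual
        return h_out, logits, p
\end{verbatim}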

\begin{table*}[t]
\small
\centering
\caption{Datasets statistics.}
\label{tab:dataset}
\resizebox{0.85\linewidth}{!}{
\renewcommand{\arraystretch}{0.94}
\begin{threeparttable}[b]
\begin{tabular}{lccccc}
\toprule
Dataset & Samples & Total Tokens\tnote{1} & Avg. Tokens/Sample\tnote{1} & Domain & Source \\
\midrule
Auto CoT & 5,816 & 943,474 & 162.22 & Arithmetic and other logical reasoning tasks & \cite{zhang2023automatic}\\
iCliniq & 7,321 & 1,826,306 & 249.46 & Conversations between patients and doctors & \cite{li2023chatdoctor}\\
Dolly 2.0 & 15,015 & 3,061,007 & 203.86 & Closed QA and summarization from Wikipedia & \cite{DatabricksBlog2023DollyV2}\\
CodeAlpaca & 20,222 & 2,195,523 & 109.66 & Code generation and optimization & \cite{codealpaca}\\
WebGPT & 18,994 & 13,988,895 & 736.49 & Information retrieval QA & \cite{nakano2021webgpt}\\
\bottomrule
\end{tabular}
\begin{tablenotes}
\item[1] Calculated by the Llama-3 tokenizer.
\end{tablenotes}
\end{threeparttable}
}
\end{table*}

\begin{table*}[t]
\centering
\caption{Experimental results across different backbone LLMs.
\textbf{*} indicates statistically significant improvements (\ie two-sided t-test with $p<0.05$) over the best PEFT baseline. Lower PPL$\downarrow$ is better, while higher BLEU$\uparrow$ and ROUGE$\uparrow$ reflect higher quality. The best results are bolded, and the second-best results are underlined.
}
\label{tab:exp1}
\resizebox{0.95\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{l|ccc|ccc|ccc|ccc|ccc}
\toprule
Backbone LLM & \multicolumn{3}{c|}{Llama 2 7B}& \multicolumn{3}{c|}{Llama 3 8B} & \multicolumn{3}{c|}{Llama 3.1 8B} & \multicolumn{3}{c|}{Mistral 7B} & \multicolumn{3}{c}{Qwen 2.5 7B} \\ \midrule
Metric & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$ & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$ & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$\\ \midrule
Full Finetune & 3.193 & \underline{0.171} & 0.231 & 3.978 & 0.151 & 0.203 & 3.873 & 0.153 & 0.205 & 4.403 & 0.157 & 0.192 & 3.024 & \underline{0.169} & 0.225 \\
LoRA & 3.222 & 0.157 & 0.225 & 3.556 & 0.148 & 0.240 & 3.537 & 0.156 & 0.237 & \underline{3.418} & \underline{0.163} & \underline{0.244} & 2.840 & 0.137 & \underline{0.239} \\
\midrule
Multi LoRA & 3.287 & 0.121 & 0.217 & 3.547 & 0.157 & 0.236 & 3.653 & 0.134 & 0.235 & 3.461 & 0.141 & 0.225 & 3.069 & 0.136 & 0.222 \\
RieMoE-LoRA & \underline{3.171} & 0.154 & \underline{0.232} & \underline{3.497} & \underline{0.159} & \underline{0.242} & \underline{3.487} & \underline{0.161} & \underline{0.238} & 3.597 & 0.143 & 0.240 & \underline{2.830} & 0.157 & 0.227 \\
HyCAM & \textbf{3.081*} & \textbf{0.173*} & \textbf{0.244*} & \textbf{3.484*} & \textbf{0.162*} & \textbf{0.245*} & \textbf{3.453*} & \textbf{0.172*} & \textbf{0.251*} & \textbf{3.299*} & \textbf{0.171*} & \textbf{0.249*} & \textbf{2.757*} & \textbf{0.172*} & \textbf{0.248*} \\
\bottomrule
\end{tabular}
}
\end{table*}

\subsection{Training Details}
The HyCAM framework, including the Shared CAM module, the Specialized CAM modules, and the dynamic router, is trained end-to-end.
We use a composite objective function that combines a primary task-specific loss with the auxiliary load-balancing loss described in Section~\ref{sec:routing}.
This approach ensures that the model not only learns to perform the target tasks effectively but also maintains balanced utilization of its specialized components, leading to efficient adaptation across diverse tasks and enhanced overall multi-task performance.


\paratitle{Task-specific Loss: }
We employ a standard autoregressive training strategy common for LLMs, as introduced in Section~\ref{sec:finetune}, where the model is trained to predict the next token in a sequence given the input context.
Given an input sequence $\mathbf{X} = (x_1, x_2, \dots, x_m)$ and its corresponding target sequence $\mathbf{Y} = (y_1, y_2, \dots, y_n)$, the model is trained to predict each token $y_t$ conditioned on the input $\mathbf{X}$ and the previous target tokens $\mathbf{Y}_{<t} = (y_1, y_2, \dots, y_{t-1})$. The primary objective is to minimize the cross-entropy loss over the target sequences:
\begin{equation}
\mathcal{L}_{\text{task}} = -\sum_{i=1}^{|\mathcal{D}|} \sum_{t=1}^{n_i} \log P(y_{i,t} | \mathbf{X}_i, \mathbf{Y}_{i,<t}; \Theta_{\text{HyCAM}}),
\label{eq:sft_loss}
\end{equation}
where $\mathcal{D}$ represents the batch of training examples, $n_i$ is the length of the $i$-th target sequence, and $P(y_{i,t} | \mathbf{X}_i, \mathbf{Y}_{i,<t}; \Theta_{\text{HyCAM}})$ is the probability of the true token $y_{i,t}$ predicted by the HyCAM framework with its trainable parameters $\Theta_{\text{HyCAM}}$.

\paratitle{Overall Training Objective: }
To ensure diverse utilization of the Specialized CAM modules, we incorporate the auxiliary load-balancing loss $\mathcal{L}_{\text{balance}}$, as defined in Equation~\ref{eq:load_balance_loss}, into the overall training objective. The final training loss $\mathcal{L}_{\text{total}}$ that HyCAM optimizes is therefore a weighted sum of the task loss and the load-balancing loss:
\begin{equation}
\mathcal{L}_{\text{total}} = \mathcal{L}_{\text{task}} + \lambda_{\text{balance}} \cdot \mathcal{L}_{\text{balance}}.
\label{eq:total_loss}
\end{equation}
Here, $\lambda_{\text{balance}}$ is a hyperparameter that controls the contribution of the load-balancing constraint.
By optimizing $\mathcal{L}_{\text{total}}$, the HyCAM framework learns to effectively perform the target tasks while maintaining a balanced and efficient use of its specialized components.
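
For illustration, the overall objective can be assembled as follows (a sketch that reuses the load-balancing loss sketched above; the weight $\lambda_{\text{balance}}$ shown here is an assumed value, not the tuned setting):
\begin{verbatim}
import torch
import torch.nn.functional as F

def hycam_loss(lm_logits, labels, router_p, router_logits, lambda_balance=0.01):
    # lm_logits: (B, T, V) next-token logits; labels: (B, T), -100 marks ignored positions
    task_loss = F.cross_entropy(
        lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1), ignore_index=-100)
    balance = load_balance_loss(router_p, router_logits)
    return task_loss + lambda_balance * balance
\end{verbatim}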

\begin{table*}[t]
\centering
\caption{Results with different sizes of the Qwen 2.5 family.}
\label{tab:expdiffqwen}
\resizebox{0.95\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{l|ccc|ccc|ccc|ccc|ccc}
\toprule
Backbone & \multicolumn{3}{c|}{Qwen 2.5 0.5B}& \multicolumn{3}{c|}{Qwen 2.5 1.5B} & \multicolumn{3}{c|}{Qwen 2.5 3B} & \multicolumn{3}{c|}{Qwen 2.5 7B} & \multicolumn{3}{c}{Qwen 2.5 14B} \\ \midrule
Metric & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$ & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$ & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$\\ \midrule
Full Finetune & 3.778 & \underline{0.159} & 0.219 & \textbf{3.102} & \textbf{0.169} & \underline{0.235} & \underline{2.982} & \underline{0.161} & 0.222 & 3.024 & \underline{0.169} & 0.225 & 2.839 & \textbf{0.176} & 0.214 \\
LoRA & 3.764 & 0.145 & 0.222 & 3.344 & 0.138 & 0.229 & 3.106 & 0.144 & 0.230 & 2.840 & 0.137 & \underline{0.239} & 2.889 & 0.147 & \underline{0.238} \\
\midrule
Multi LoRA & 3.754 & 0.144 & 0.221 & 3.330 & 0.148 & 0.226 & 3.053 & 0.157 & 0.225 & 3.069 & 0.136 & 0.222 & 2.882 & 0.152 & 0.235 \\
RieMoE-LoRA & \underline{3.621} & 0.152 & \underline{0.232} & 3.180 & 0.148 & 0.230 & 3.001 & 0.148 & \underline{0.238} & \underline{2.830} & 0.157 & 0.227 & \underline{2.792} & 0.142 & \underline{0.238} \\
HyCAM & \textbf{3.611} & \textbf{0.169} & \textbf{0.262} & \underline{3.108} & \underline{0.167} & \textbf{0.236} & \textbf{2.940} & \textbf{0.165} & \textbf{0.249} & \textbf{2.757} & \textbf{0.172} & \textbf{0.248} & \textbf{2.682} & \underline{0.160} & \textbf{0.242} \\
\bottomrule
\end{tabular}
}
\end{table*}

\begin{table}[t]
\centering
\caption{Results with different sizes of Llama 3.2.}
\label{tab:expsmallllama}
\resizebox{0.95\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{l|ccc|ccc}
\toprule
Backbone & \multicolumn{3}{c|}{Llama 3.2 1B}& \multicolumn{3}{c}{Llama 3.2 3B}\\ \midrule
Metric & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$\\ \midrule
Full Finetune & \textbf{4.221} & \textbf{0.164} & 0.221 & \textbf{3.747} & \underline{0.159} & 0.220 \\
LoRA & 4.515 & 0.144 & 0.227 & 3.824 & 0.144 & \underline{0.234} \\
\midrule
Multi LoRA & 4.533 & 0.143 & 0.225 & 3.876 & 0.149 & 0.232 \\
RieMoE-LoRA & 4.324 & 0.161 & \underline{0.241} & 3.806 & 0.154 & 0.233 \\
HyCAM & \underline{4.227} & \underline{0.163} & \textbf{0.244} & \underline{3.778} & \textbf{0.167} & \textbf{0.243} \\
\bottomrule
\end{tabular}
}
\end{table}

\begin{table*}[t]
\centering
\caption{Detailed experimental results across the five datasets.}
\label{tab:crosstaskresult}
\resizebox{0.95\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{l|ccc|ccc|ccc|ccc|ccc}
\toprule
Dataset & \multicolumn{3}{c|}{Auto CoT}& \multicolumn{3}{c|}{iCliniq} & \multicolumn{3}{c|}{Dolly 2.0} & \multicolumn{3}{c|}{CodeAlpaca} & \multicolumn{3}{c}{WebGPT} \\ \midrule
Metric & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$ & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$ & PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$& PPL$\downarrow$ & BLEU$\uparrow$ & ROUGE$\uparrow$\\ \midrule
Full Finetune & 1.842 & \underline{0.282} & 0.287 & \textbf{7.497} & \textbf{0.053} & 0.123 & 6.461 & 0.088 & \textbf{0.200} & 2.532 & 0.138 & 0.195 & \underline{1.888} & \textbf{0.182} & \textbf{0.341} \\
LoRA & 1.843 & 0.268 & 0.291 & 8.140 & 0.049 & \underline{0.124} & 6.029 & 0.070 & 0.181 & 2.404 & \underline{0.146} & 0.202 & 1.919 & 0.178 & 0.331 \\
\midrule
Multi LoRA & 1.952 & 0.198 & 0.290 & 8.846 & 0.037 & 0.122 & \textbf{5.743} & 0.101 & 0.177 & \textbf{2.312} & 0.134 & 0.189 & 1.939 & 0.176 & \underline{0.337} \\
RieMoE-LoRA & \underline{1.813} & 0.275 & \textbf{0.298} & 8.001 & 0.051 & 0.123 & 5.954 & \textbf{0.106} & 0.183 & 2.381 & 0.142 & \underline{0.207} & \underline{1.888} & 0.177 & 0.336 \\
HyCAM & \textbf{1.777} & \textbf{0.283} & \underline{0.297} & \underline{7.546} & \textbf{0.053} & \textbf{0.125} & \underline{5.893} & \underline{0.093} & \underline{0.194} & \underline{2.359} & \textbf{0.163} & \textbf{0.222} & \textbf{1.845} & \underline{0.180} & \underline{0.337} \\
\bottomrule
\end{tabular}
}
\end{table*}

\section{Experiments}
In this section, we present a comprehensive evaluation of our proposed Hybrid Contextual Attention Modulation (HyCAM) framework.
To systematically evaluate the performance and characteristics of HyCAM, we organize our experimental analysis around the following key research questions (RQs):
\begin{itemize}[leftmargin=*]
\item \textbf{RQ1}: How does the HyCAM framework perform overall in multi-task adaptation compared to state-of-the-art baseline methods across various backbone models?
\item \textbf{RQ2}: How does HyCAM scale when applied to backbone Large Language Models of different sizes?
\item \textbf{RQ3}: How does HyCAM perform on individual tasks within the multi-task benchmark, and does it demonstrate a balanced adaptation capability across these diverse task types?
\item \textbf{RQ4}: What are the contributions of the core CAM mechanism and other components of HyCAM to its overall effectiveness, and how sensitive is its performance to key hyperparameters?
\item \textbf{RQ5}: What qualitative insights do visualizations offer regarding HyCAM's internal working mechanisms?
\end{itemize}

\subsection{Experimental Setup}
\subsubsection{\textbf{Datasets}}
To evaluate the effectiveness of HyCAM in complex multi-task scenarios, we construct a comprehensive benchmark comprising five datasets from distinct domains: Auto CoT (logical reasoning), iCliniq (medical QA), Dolly 2.0 (general instruction following), CodeAlpaca (code generation), and WebGPT (information retrieval QA).
Detailed information, including statistics, domains, and sources, is presented in Table~\ref{tab:dataset}. For robust evaluation across these datasets, we employ a 7:2:1 split for training, validation, and testing, and conduct five-fold cross-validation.

\subsubsection{\textbf{Backbone Models}}
|
||||
We evaluate HyCAM across three state-of-the-art open-source LLM families to demonstrate its applicability:
|
||||
|
||||
\paratitle{LLaMA.} We utilize multiple versions from Meta AI's LLaMA series, which is known for strong zero-shot and few-shot performance enabled by large-scale pre-training.
|
||||
Specifically, our experiments include Llama2-7B, Llama3-8B, Llama3.1-8B, and the smaller Llama3.2-1B/3B models.
|
||||
|
||||
\paratitle{Mistral.} Introduced by Mistral AI,
|
||||
Mistral features compact models that achieve performance comparable to mainstream LLMs through efficient architectural designs such as grouped-query and sliding-window attention.
|
||||
We use Mistral-7B-v0.3 as our base model.
|
||||
|
||||
|
||||
\paratitle{Qwen.}
|
||||
Developed by Alibaba, Qwen focuses on efficient inference and robust performance across diverse tasks.
|
||||
We adopt various sizes of the Qwen2.5 series, ranging from 0.5B to 14B parameters.
|
||||
|
||||
|
||||
\subsubsection{\textbf{Baseline Methods}}
|
||||
We evaluate HyCAM against several representative task adaptation methods, detailed as follows:
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textbf{Full Fine-Tuning} involves updating all parameters of the backbone LLM for adaptation to the target tasks.
|
||||
\item \textbf{LoRA}~\cite{hu2021lora} injects trainable low-rank adaptation matrices, allowing efficient adaptation with reduced trainable parameters.
|
||||
\item \textbf{Multi LoRA}~\cite{wang2023multilora} applies multiple LoRA adapters in parallel, enabling independent adaptation for different tasks.
|
||||
\item \textbf{RieMoE-LoRA}~\cite{sun2025stronger} integrates Riemannian gradient rescaling with MoE-LoRA to preserve expressiveness while stabilizing optimization and accelerating convergence.
|
||||
\end{itemize}
|
||||
|
||||
\subsubsection{\textbf{Evaluation Metrics}} To comprehensively evaluate the performance of methods across diverse tasks and domains, we adopt three commonly used evaluation metrics:
|
||||
|
||||
\noindent\textbf{PPL} (Perplexity) assesses the fluency and overall language generation quality. Lower PPL scores indicate better performance.\\
|
||||
\textbf{BLEU-4} (Bilingual Evaluation Understudy)~\cite{papineni2002bleu} measures the $n$-gram overlap between generated and reference texts, which is particularly relevant for tasks like code generation. Higher BLEU-4 scores are better. \\
|
||||
\textbf{ROUGE-L} (Recall-Oriented Understudy for Gisting Evaluation - Longest Common Subsequence)~\cite{lin2004rouge} captures content overlap and semantic similarity, especially for longer outputs like summaries. Higher ROUGE-L scores indicate better performance.
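For reference, perplexity is computed as the exponentiated average negative log-likelihood over the $T$ target tokens,
\begin{equation}
\mathrm{PPL} = \exp\Big(-\frac{1}{T}\sum_{t=1}^{T}\log p_\theta\big(y_t \mid y_{<t}\big)\Big),
\end{equation}
while BLEU-4 combines modified $n$-gram precisions up to $n=4$ with uniform weights and a brevity penalty, and ROUGE-L reports the F-measure of the longest common subsequence between the generated and reference texts.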
|
||||
|
||||
\subsubsection{\textbf{Implementation Details}}
|
||||
All our experiments are implemented using PyTorch, and we leverage DeepSpeed for efficient training with BFloat16 mixed-precision.
|
||||
For LoRA-based methods, the LoRA rank ($r$) is set to 64, and LoRA is applied to all linear layers of the LLM.
|
||||
For methods involving multiple specialized modules, the number of modules ($N_s$) is set to 5.
|
||||
The maximum token length is set to 1,200 to accommodate long-text tasks.
|
||||
For HyCAM-specific hyperparameters, the Gumbel-Softmax temperature $\tau$ is set to 0.5, and the load-balancing loss coefficient $\lambda_{\text{balance}}$ is set to 0.1.
|
||||
For training, we use the AdamW optimizer with a learning rate of 2e-5, a cosine decay learning rate scheduler, and early stopping based on validation loss to prevent overfitting.
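To illustrate the role of $\tau$, recall the standard Gumbel-Softmax relaxation: given routing probabilities $\pi_1, \dots, \pi_{N_s}$ over the Specialized CAMs, the soft selection weights are
\begin{equation}
\tilde{\pi}_i = \frac{\exp\big((\log \pi_i + g_i)/\tau\big)}{\sum_{j=1}^{N_s} \exp\big((\log \pi_j + g_j)/\tau\big)}, \qquad g_i \sim \mathrm{Gumbel}(0, 1),
\end{equation}
where smaller values of $\tau$ yield sharper, more nearly one-hot selections and larger values yield smoother mixing, and $\lambda_{\text{balance}}$ weights the auxiliary load-balancing loss relative to the task loss.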
|
||||
\subsection{Overall Performance (RQ1)}
|
||||
To answer \textbf{RQ1}, we evaluate the overall multi-task adaptation performance of HyCAM against the baseline methods.
|
||||
The experiment is conducted across different backbone LLMs of comparable scale (7B/8B parameters) with our comprehensive multi-task datasets.
|
||||
The main results, summarized as the average performance across all tasks, are presented in Table~\ref{tab:exp1}.
|
||||
|
||||
The experimental results demonstrate the superior overall performance of HyCAM.
|
||||
On average, HyCAM achieves a 3.65\% relative improvement across all metrics and backbone LLMs compared to the best baseline, and the improvements are statistically significant ($p < 0.05$, indicated by an asterisk~($\ast$) in the table).
|
||||
|
||||
Compared to Full Fine-Tuning, HyCAM achieves comparable or better performance while updating only a small fraction of the parameters, underscoring its advantage in computational efficiency.
|
||||
Compared with the single-task PEFT method LoRA, HyCAM shows substantial gains. While LoRA provides a parameter-efficient alternative to full fine-tuning, its capacity can be limited in complex multi-task scenarios.
|
||||
HyCAM, with its CAM mechanism and hybrid architecture design, is better able to handle the diverse demands of multiple tasks simultaneously, thus achieving better adaptation performance.
|
||||
|
||||
Furthermore, HyCAM outperforms other multi-task approaches, including Multi LoRA and RieMoE-LoRA.
|
||||
While Multi LoRA allows for parallel task adaptations, it may not facilitate effective knowledge sharing between tasks.
|
||||
RieMoE-LoRA, despite its gradient rescaling mechanism, may struggle with optimal expert utilization.
|
||||
HyCAM's integration of Shared CAM and Specialized CAM, with a dynamic routing strategy, provides a more effective framework for both knowledge sharing and specialized adaptation.
|
||||
|
||||
\vspace{-5px}
|
||||
\subsection{Scalability Analysis (RQ2)}
|
||||
To address \textbf{RQ2} and understand how the effectiveness of HyCAM scales with the size of the backbone model, we evaluate its performance across a range of model sizes within two LLM families: Qwen2.5 (from 0.5B to 14B parameters) and Llama3.2 (1B and 3B parameters), as shown in Tables~\ref{tab:expdiffqwen} and~\ref{tab:expsmallllama}.
|
||||
This analysis aims to determine if the advantages are consistent across different model sizes and whether they become more or less significant with increasing size.
|
||||
|
||||
The results indicate that HyCAM consistently outperforms other PEFT-based methods across all tested model sizes within both the Qwen and Llama families.
|
||||
While Full Fine-Tuning remains a strong baseline, HyCAM consistently offers a more parameter-efficient solution with competitive, and often superior, performance.
|
||||
Moreover, a key observation is that the relative advantage of HyCAM often becomes more pronounced as the model size increases. This suggests that larger models, with their greater capacity and more extensive general knowledge, benefit even more from HyCAM's ability to dynamically modulate attention and integrate shared and specialized knowledge effectively.
|
||||
|
||||
\vspace{-5px}
|
||||
\subsection{Task-Specific Analysis (RQ3)}
|
||||
To address \textbf{RQ3}, we analyze the performance on individual tasks within our multi-task benchmark.
|
||||
We break down the results obtained with the Llama2-7B backbone model, as presented in Table~\ref{tab:crosstaskresult}.
|
||||
The results indicate that HyCAM generally achieves competitive performance across individual tasks.
|
||||
It is worth noting that performance varies considerably across tasks, due to differences in inherent complexity and task characteristics.
|
||||
In particular, the results on the iCliniq and Dolly 2.0 datasets are notably lower than those on the other datasets.
|
||||
This observation highlights the importance of considering task complexity when designing multi-task learning frameworks for real-world applications.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{assets/tsne.pdf}
|
||||
\caption{t-SNE scatter plots of attention representations. Higher cluster density indicates improved representational capacity of the attention module.}
|
||||
\label{fig:tSNE}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Ablation and Hyperparameter Analysis (RQ4)}
|
||||
\begin{figure}[tb]
|
||||
\centering
|
||||
\begin{minipage}[t]{.48\columnwidth}
|
||||
\vspace{2pt}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{assets/sens_nspec.pdf}
|
||||
\captionsetup{font=small}
|
||||
\captionof{figure}{Impact of Number of Specialized CAMs.}
|
||||
\label{fig:cam_sens}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}[t]{.48\columnwidth}
|
||||
\centering
|
||||
\captionsetup{font=small}
|
||||
\captionof{table}{Ablation study of HyCAM on Llama2-7B.}
|
||||
\vspace{-1pt}
|
||||
\renewcommand{\arraystretch}{1}
|
||||
\resizebox{.8\linewidth}{!}{
|
||||
\begin{tabular}{lc}
|
||||
\toprule
|
||||
Variant & PPL$\downarrow$\\
|
||||
\midrule
|
||||
Shared-CAM-Only & 3.129\\
|
||||
HyCAM-FullSpec & 3.102\\
|
||||
HyCAM-SpecOnly & 3.216\\
|
||||
HyCAM-InversePEFT & 3.129\\
|
||||
\textbf{HyCAM} & \textbf{3.081}\\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\label{tab:hycam_ablation}
|
||||
\end{minipage}
|
||||
\end{figure}
|
||||
|
||||
For \textbf{RQ4}, we investigate the contributions of HyCAM components and their sensitivity to key hyperparameters.
|
||||
All studies here are conducted on the Llama2-7B model using the PPL metric.
|
||||
First, we perform ablation studies by evaluating several variants of HyCAM:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item \textbf{Shared-CAM-Only}: Retains only the single, shared full-parameter CAM module, removing all Specialized CAMs and the routing mechanism, to assess the baseline impact of CAM.
|
||||
\item \textbf{HyCAM-FullSpec}: Both the shared CAM and all Specialized CAMs are implemented with full parameters, to evaluate the benefit of using PEFT for the specialized components.
|
||||
\item \textbf{HyCAM-SpecOnly}: Removes the shared, full-parameter CAM, relying exclusively on the ensemble of PEFT-based Specialized CAMs managed by the dynamic router and load-balancing loss.
|
||||
\item \textbf{HyCAM-InversePEFT}: A special configuration where the Shared CAM module is implemented using PEFT, while the Specialized CAMs are full-parameter CAM modules, to validate our architectural design for parameter allocation.
|
||||
\end{itemize}
|
||||
Overall, these experiments demonstrate that each component and design choice in HyCAM contributes positively to its overall performance, as shown in Table~\ref{tab:hycam_ablation}.
|
||||
We further analyzed HyCAM's sensitivity to the number of Specialized CAM modules ($N_s$), as illustrated in Figure~\ref{fig:cam_sens}.
|
||||
|
||||
\subsection{Qualitative Evaluation (RQ5)}
|
||||
To answer \textbf{RQ5} and gain in-depth insights into the inner workings of HyCAM, we employ several visualization techniques:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item \textbf{Enhanced Representational Coherence:} We utilize t-SNE to visualize the representations of the value matrix ($V$) within the self-attention module after applying our HyCAM.
|
||||
Each point in the plot corresponds to a token-level value representation projected into two dimensions.
|
||||
As shown in Figure~\ref{fig:tSNE}, CAM leads to more coherent clusters in the learned representations compared to a non-modulated baseline. This suggests an improved capacity to form meaningful representations.
|
||||
\item \textbf{Feature Selective Modulation:} To understand how CAM impacts features, we visualize the modulation weight matrix generated by the HyCAM mechanism. As in Figure~\ref{fig:ht}, these weights demonstrate selective amplification of features relevant to the input context, while other features are attenuated. This highlights CAM's ability to perform context-dependent adjustments.
|
||||
\item \textbf{Accelerated Convergence:} Figure~\ref{fig:loss} presents a comparison of training loss curves for HyCAM against baseline methods. These curves illustrate that HyCAM reaches a lower loss more rapidly and stably, indicating more efficient learning.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\begin{figure}[tb]
|
||||
\centering
|
||||
\begin{minipage}[t]{.45\columnwidth}
|
||||
\vspace{2pt}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{assets/ht.pdf}
|
||||
\captionsetup{font=small}
|
||||
\captionof{figure}{The modulation weight matrix of HyCAM.}
|
||||
\label{fig:ht}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}[t]{.54\columnwidth}
|
||||
\vspace{12pt}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{assets/loss.pdf}
|
||||
\captionsetup{font=small}
|
||||
\captionof{figure}{Comparison of the training processes across different methods.}
|
||||
\label{fig:loss}
|
||||
\end{minipage}
|
||||
\end{figure}
|
||||
|
||||
\section{Related Work}
|
||||
|
||||
\subsection{Parameter-Efficient Fine-Tuning}
|
||||
To improve the efficiency of LLMs and make them more practical for real-world applications~\cite{liu2024moe}, various methods have been proposed, including pruning~\cite{wang2025put}, compression~\cite{wang2023large}, and PEFT.
|
||||
The primary goal of PEFT methods is to adapt LLMs to specific tasks by updating only a small fraction of parameters, thereby preserving pre-trained knowledge and significantly reducing computational and memory costs~\cite{han2024parameter, fu2025sliding}.
|
||||
Conventional PEFT strategies include additive methods like adapters, selective fine-tuning, and reparameterization techniques such as low-rank adaptation.
|
||||
Adapters, such as AdapterFusion~\cite{pfeiffer2020adapterfusion}, involve inserting lightweight, task-specific modules into the layers of the pre-trained model.
|
||||
Selective fine-tuning approaches, such as BitFit~\cite{zaken2021bitfit}, demonstrate that even minimal parameter adjustments can be effective for many tasks.
|
||||
Reparameterization methods, such as Low-Rank Adaptation (LoRA)~\cite{hu2021lora} and its variants AdaLoRA, DoRA, and MetaLoRA~\cite{zhang2023adalora, liu2024dora, wang2025metalora}, apply low-rank decompositions to the weight updates of target layers. LoRA builds on the observation that the weight updates arising during adaptation possess a low intrinsic rank.
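Concretely, for a pre-trained weight matrix $W_0 \in \mathbb{R}^{d \times k}$, LoRA freezes $W_0$ and parameterizes the update as a product of two low-rank matrices,
\begin{equation}
W = W_0 + \Delta W = W_0 + BA, \qquad B \in \mathbb{R}^{d \times r},\; A \in \mathbb{R}^{r \times k},\; r \ll \min(d, k),
\end{equation}
so that only $r(d + k)$ parameters are trained for each adapted weight matrix.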
|
||||
Despite their success in reducing computational demands, existing PEFT methods often perform suboptimally in complex multi-task scenarios. A key challenge is balancing knowledge retention with task-specific specialization across multiple, potentially conflicting, objectives. Many PEFT approaches may exhibit limited generalization and representational capacity across diverse tasks, or suffer from potential interference when adapted to multiple objectives simultaneously.
|
||||
|
||||
\subsection{Multi-task Adaptation Methods}
|
||||
Multi-task learning (MTL) is fundamental across many real-world domains, including recommendation systems~\cite{liu2023multi, liu2025multi}, environmental prediction~\cite{hettige2024airphynet, han2025bridging, ji2023multi}, and heterogeneous time-series analysis~\cite{du2021gan,wang2023wavelet}.
|
||||
Within LLMs, MTL aims to enhance generalization by sharing knowledge across tasks~\cite{guo2024large, zhao2023survey, wang2024llm4msr}, but a key challenge lies in balancing shared knowledge with task-specific specialization~\cite{gao2024higher}.
|
||||
Several strategies have been explored for MTL in LLMs. A common paradigm involves hard parameter sharing, as exemplified by T5~\cite{raffel2020exploring}.
|
||||
PEFT techniques have also been adapted for MTL. For instance, Multi-LoRA~\cite{wang2023multilora} and adapter-based methods~\cite{pfeiffer2020mad} insert small trainable modules into the frozen pre-trained model.
|
||||
Another type of multi-task method is the Mixture-of-Experts (MoE) framework, such as Switch Transformers~\cite{fedus2022switch} and GShard~\cite{lepikhin2020gshard}, which employ a routing mechanism that activates a subset of experts for each input. MoE-LoRA~\cite{luo2024moelora} introduces Layer-wise Expert Allocation (MoLA), which assigns experts at different Transformer layers for better adaptability.
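Schematically, an MoE layer computes a gated combination of $N$ experts $\{E_i\}_{i=1}^{N}$,
\begin{equation}
y = \sum_{i=1}^{N} g_i(x)\, E_i(x),
\end{equation}
where the router output $g(x)$ is typically a softmax over expert logits sparsified to the top-$k$ experts, so that only a small subset of experts is active for each token.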
|
||||
While powerful, MoE-based approaches often face challenges such as expert load imbalance, which can degrade training efficiency.
|
||||
Our proposed HyCAM framework offers a novel approach to these challenges by combining CAM with a hybrid strategy for knowledge sharing and specialization, enabling efficient knowledge sharing while preserving task-specific adaptations.
|
||||
|
||||
\section{Conclusion}
|
||||
In this work, we propose HyCAM, a novel framework designed to enhance multi-task adaptation in Large Language Models by integrating our core CAM mechanism within a hybrid architecture.
|
||||
HyCAM effectively balances generalization and specialization by employing a shared, full-parameter CAM module for broad knowledge retention and multiple specialized, lightweight CAM modules for fine-grained, task-specific feature enhancement. Our dynamic soft-routing strategy, with a load-balancing loss, ensures adaptive knowledge fusion and efficient utilization of these specialized components.
|
||||
Extensive experiments show that HyCAM significantly outperforms existing approaches, demonstrating its ability to adapt efficiently while preserving pre-trained general knowledge.
|
||||
828
mypaper/IJCAI2026_CASCADE.bib
Executable file
@@ -0,0 +1,828 @@
|
||||
% Related
|
||||
|
||||
@article{lialin2023scaling,
|
||||
title={Scaling down to scale up: A guide to parameter-efficient fine-tuning},
|
||||
author={Lialin, Vladislav and Deshpande, Vijeta and Rumshisky, Anna},
|
||||
journal={arXiv preprint arXiv:2303.15647},
|
||||
year={2023}
|
||||
}
|
||||
% SDCTFT
|
||||
@article{shen2024parameter,
|
||||
title={Parameter-efficient fine-tuning via selective discrete cosine transform},
|
||||
author={Shen, Yixian and Bi, Qi and Huang, Jia-Hong and Zhu, Hongyi and Pathania, Anuj},
|
||||
journal={arXiv preprint arXiv:2410.09103},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
% Freq
|
||||
@article{gao2024parameter,
|
||||
title={Parameter-efficient fine-tuning with discrete fourier transform},
|
||||
author={Gao, Ziqi and Wang, Qichao and Chen, Aochuan and Liu, Zijing and Wu, Bingzhe and Chen, Liang and Li, Jia},
|
||||
journal={arXiv preprint arXiv:2405.03003},
|
||||
year={2024}
|
||||
}
|
||||
@article{hu2025waveletft,
|
||||
title={WaveletFT: Discrete wavelet transform for parameter-efficient fine-tuning},
|
||||
author={Hu, Can and Yang, Jie and Song, Shien and Fan, Wentao and Xie, Tao},
|
||||
journal={Neurocomputing},
|
||||
pages={130765},
|
||||
year={2025},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
% Little Wavelet
|
||||
@article{bilican2025exploring,
|
||||
title={Exploring Sparsity for Parameter Efficient Fine Tuning Using Wavelets},
|
||||
author={Bilican, Ahmet and Y{\i}lmaz, M Ak{\i}n and Tekalp, A Murat and Cinbi{\c{s}}, R G{\"o}kberk},
|
||||
journal={arXiv preprint arXiv:2505.12532},
|
||||
year={2025}
|
||||
}
|
||||
@article{zhang2025f,
|
||||
title={F-Adapter: Frequency-Adaptive Parameter-Efficient Fine-Tuning in Scientific Machine Learning},
|
||||
author={Zhang, Hangwei and Kang, Chun and Wang, Yan and Zou, Difan},
|
||||
journal={arXiv preprint arXiv:2509.23173},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
|
||||
% LoCA
|
||||
@article{du2025loca,
|
||||
title={LoCA: Location-Aware Cosine Adaptation for Parameter-Efficient Fine-Tuning},
|
||||
author={Du, Zhekai and Min, Yinjie and Li, Jingjing and Lu, Ke and Zou, Changliang and Peng, Liuhua and Chu, Tingjin and Gong, Mingming},
|
||||
journal={arXiv preprint arXiv:2502.06820},
|
||||
year={2025}
|
||||
}
|
||||
% Flylora
|
||||
@article{zou2025flylora,
|
||||
title={FlyloRA: Boosting task decoupling and parameter efficiency via implicit rank-wise mixture-of-experts},
|
||||
author={Zou, Heming and Zang, Yunliang and Xu, Wutong and Zhu, Yao and Ji, Xiangyang},
|
||||
journal={arXiv preprint arXiv:2510.08396},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
|
||||
% LLM
|
||||
@misc{qwen3technicalreport,
|
||||
title={Qwen3 Technical Report},
|
||||
author={Qwen Team},
|
||||
year={2025},
|
||||
eprint={2505.09388},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CL},
|
||||
url={https://arxiv.org/abs/2505.09388},
|
||||
}
|
||||
@article{grattafiori2024llama,
|
||||
title={The llama 3 herd of models},
|
||||
author={Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and others},
|
||||
journal={arXiv preprint arXiv:2407.21783},
|
||||
year={2024}
|
||||
}
|
||||
@article{gemma_2025,
|
||||
title={Gemma 3},
|
||||
url={https://goo.gle/Gemma3Report},
|
||||
publisher={Kaggle},
|
||||
author={Gemma Team},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
% IJCAI
|
||||
@article{han2024parameter,
|
||||
title={Parameter-efficient fine-tuning for large models: A comprehensive survey},
|
||||
author={Han, Zeyu and Gao, Chao and Liu, Jinyang and Zhang, Jeff and Zhang, Sai Qian},
|
||||
journal={arXiv preprint arXiv:2403.14608},
|
||||
year={2024}
|
||||
}
|
||||
@inproceedings{shiracite,
|
||||
author = {Bhardwaj, Kartikeya and Pandey, Nilesh Prasad and Priyadarshi, Sweta and Ganapathy, Viswanath and Kadambi, Shreya and Esteves, Rafael and Borse, Shubhankar and Whatmough, Paul and Garrepalli, Risheek and Van Baalen, Mart and Teague, Harris and Nagel, Markus},
|
||||
title = {Sparse high rank adapters},
|
||||
year = {2024},
|
||||
isbn = {9798331314385},
|
||||
publisher = {Curran Associates Inc.},
|
||||
address = {Red Hook, NY, USA},
|
||||
booktitle = {Proceedings of the 38th International Conference on Neural Information Processing Systems},
|
||||
articleno = {438},
|
||||
numpages = {31},
|
||||
location = {Vancouver, BC, Canada},
|
||||
series = {NIPS '24}
|
||||
}
|
||||
@article{hu2021lora,
|
||||
title={Lora: Low-rank adaptation of large language models},
|
||||
author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
|
||||
journal={arXiv preprint arXiv:2106.09685},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
% adapter
|
||||
@inproceedings{houlsby2019parameter,
|
||||
title={Parameter-efficient transfer learning for NLP},
|
||||
author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={2790--2799},
|
||||
year={2019},
|
||||
organization={PMLR}
|
||||
}
|
||||
|
||||
% AAAING
|
||||
|
||||
% Datasets
|
||||
% GSM8K
|
||||
@article{cobbe2021training,
|
||||
title={Training verifiers to solve math word problems},
|
||||
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
|
||||
journal={arXiv preprint arXiv:2110.14168},
|
||||
year={2021}
|
||||
}
|
||||
% SVAMP
|
||||
@article{patel2021nlp,
|
||||
title={Are NLP models really able to solve simple math word problems?},
|
||||
author={Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
|
||||
journal={arXiv preprint arXiv:2103.07191},
|
||||
year={2021}
|
||||
}
|
||||
% MultiArith
|
||||
@article{roy2016solving,
|
||||
title={Solving general arithmetic word problems},
|
||||
author={Roy, Subhro and Roth, Dan},
|
||||
journal={arXiv preprint arXiv:1608.01413},
|
||||
year={2016}
|
||||
}
|
||||
% Addsub
|
||||
@inproceedings{hosseini2014learning,
|
||||
title={Learning to solve arithmetic word problems with verb categorization},
|
||||
author={Hosseini, Mohammad Javad and Hajishirzi, Hannaneh and Etzioni, Oren and Kushman, Nate},
|
||||
booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
|
||||
pages={523--533},
|
||||
year={2014}
|
||||
}
|
||||
% AQuA
|
||||
@article{ling2017program,
|
||||
title={Program induction by rationale generation: Learning to solve and explain algebraic word problems},
|
||||
author={Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
|
||||
journal={arXiv preprint arXiv:1705.04146},
|
||||
year={2017}
|
||||
}
|
||||
% SingleEq
|
||||
@article{koncel2015parsing,
|
||||
title={Parsing algebraic word problems into equations},
|
||||
author={Koncel-Kedziorski, Rik and Hajishirzi, Hannaneh and Sabharwal, Ashish and Etzioni, Oren and Ang, Siena Dumas},
|
||||
journal={Transactions of the Association for Computational Linguistics},
|
||||
volume={3},
|
||||
pages={585--597},
|
||||
year={2015},
|
||||
publisher={MIT Press}
|
||||
}
|
||||
% MAWPS
|
||||
@inproceedings{koncel2016mawps,
|
||||
title={MAWPS: A math word problem repository},
|
||||
author={Koncel-Kedziorski, Rik and Roy, Subhro and Amini, Aida and Kushman, Nate and Hajishirzi, Hannaneh},
|
||||
booktitle={Proceedings of the 2016 conference of the north american chapter of the association for computational linguistics: human language technologies},
|
||||
pages={1152--1157},
|
||||
year={2016}
|
||||
}
|
||||
% BoolQ
|
||||
@article{clark2019boolq,
|
||||
title={Boolq: Exploring the surprising difficulty of natural yes/no questions},
|
||||
author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
|
||||
journal={arXiv preprint arXiv:1905.10044},
|
||||
year={2019}
|
||||
}
|
||||
% PIQA
|
||||
@inproceedings{bisk2020piqa,
|
||||
title={Piqa: Reasoning about physical commonsense in natural language},
|
||||
author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
|
||||
booktitle={Proceedings of the AAAI conference on artificial intelligence},
|
||||
volume={34},
|
||||
number={05},
|
||||
pages={7432--7439},
|
||||
year={2020}
|
||||
}
|
||||
% SIQA
|
||||
@article{sap2019socialiqa,
|
||||
title={Socialiqa: Commonsense reasoning about social interactions},
|
||||
author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and LeBras, Ronan and Choi, Yejin},
|
||||
journal={arXiv preprint arXiv:1904.09728},
|
||||
year={2019}
|
||||
}
|
||||
% HW
|
||||
@article{zellers2019hellaswag,
|
||||
title={Hellaswag: Can a machine really finish your sentence?},
|
||||
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
|
||||
journal={arXiv preprint arXiv:1905.07830},
|
||||
year={2019}
|
||||
}
|
||||
% WN
|
||||
@inproceedings{sakaguchi2020winogrande,
|
||||
title={Winogrande: An adversarial winograd schema challenge at scale},
|
||||
author={Sakaguchi, Keisuke and Le Bras, Ronan and Bhagavatula, Chandra and Choi, Yejin},
|
||||
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
volume={34},
|
||||
number={05},
|
||||
pages={8732--8740},
|
||||
year={2020}
|
||||
}
|
||||
% ARC
|
||||
@article{clark2018think,
|
||||
title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
|
||||
author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
|
||||
journal={arXiv preprint arXiv:1803.05457},
|
||||
year={2018}
|
||||
}
|
||||
% OBDA
|
||||
@article{mihaylov2018can,
|
||||
title={Can a suit of armor conduct electricity? a new dataset for open book question answering},
|
||||
author={Mihaylov, Todor and Clark, Peter and Khot, Tushar and Sabharwal, Ashish},
|
||||
journal={arXiv preprint arXiv:1809.02789},
|
||||
year={2018}
|
||||
}
|
||||
|
||||
% Related
|
||||
|
||||
@article{li2021prefix,
|
||||
title={Prefix-tuning: Optimizing continuous prompts for generation},
|
||||
author={Li, Xiang Lisa and Liang, Percy},
|
||||
journal={arXiv preprint arXiv:2101.00190},
|
||||
year={2021}
|
||||
}
|
||||
@article{dong2025attention,
|
||||
title={Attention Retrieves, MLP Memorizes: Disentangling Trainable Components in the Transformer},
|
||||
author={Dong, Yihe and Noci, Lorenzo and Khodak, Mikhail and Li, Mufan},
|
||||
journal={arXiv preprint arXiv:2506.01115},
|
||||
year={2025}
|
||||
}
|
||||
@article{michel2019sixteen,
|
||||
title={Are sixteen heads really better than one?},
|
||||
author={Michel, Paul and Levy, Omer and Neubig, Graham},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={32},
|
||||
year={2019}
|
||||
}
|
||||
@article{belinkov2018evaluating,
|
||||
title={Evaluating layers of representation in neural machine translation on part-of-speech and semantic tagging tasks},
|
||||
author={Belinkov, Yonatan and M{\`a}rquez, Llu{\'\i}s and Sajjad, Hassan and Durrani, Nadir and Dalvi, Fahim and Glass, James},
|
||||
journal={arXiv preprint arXiv:1801.07772},
|
||||
year={2018}
|
||||
}
|
||||
% Others
|
||||
@article{ding2023parameter,
|
||||
title={Parameter-efficient fine-tuning of large-scale pre-trained language models},
|
||||
author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
|
||||
journal={Nature machine intelligence},
|
||||
volume={5},
|
||||
number={3},
|
||||
pages={220--235},
|
||||
year={2023},
|
||||
publisher={Nature Publishing Group UK London}
|
||||
}
|
||||
@article{peng2023instruction,
|
||||
title={Instruction tuning with gpt-4},
|
||||
author={Peng, Baolin and Li, Chunyuan and He, Pengcheng and Galley, Michel and Gao, Jianfeng},
|
||||
journal={arXiv preprint arXiv:2304.03277},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
% Baselines
|
||||
@article{liu2024dora,
|
||||
title={Dora: Weight-decomposed low-rank adaptation},
|
||||
author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
|
||||
journal={arXiv preprint arXiv:2402.09353},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
@article{zhang2023adalora,
|
||||
title={Adalora: Adaptive budget allocation for parameter-efficient fine-tuning},
|
||||
author={Zhang, Qingru and Chen, Minshuo and Bukharin, Alexander and Karampatziakis, Nikos and He, Pengcheng and Cheng, Yu and Chen, Weizhu and Zhao, Tuo},
|
||||
journal={arXiv preprint arXiv:2303.10512},
|
||||
year={2023}
|
||||
}
|
||||
% C3A
|
||||
@article{chen2024parameter,
|
||||
title={Parameter-efficient fine-tuning via circular convolution},
|
||||
author={Chen, Aochuan and Cheng, Jiashun and Liu, Zijing and Gao, Ziqi and Tsung, Fugee and Li, Yu and Li, Jia},
|
||||
journal={arXiv preprint arXiv:2407.19342},
|
||||
year={2024}
|
||||
}
|
||||
% BONE
|
||||
@article{kang2024balancing,
|
||||
title={Balancing LoRA Performance and Efficiency with Simple Shard Sharing},
|
||||
author={Kang, Jiale and Yin, Qingyu},
|
||||
journal={arXiv preprint arXiv:2409.15371},
|
||||
year={2024}
|
||||
}
|
||||
% VERA-EDITED
|
||||
@article{kopiczko2023vera,
|
||||
title={Vera: Vector-based random matrix adaptation},
|
||||
author={Kopiczko, Dawid Jan and Blankevoort, Tijmen and Asano, Yuki M},
|
||||
journal={arXiv preprint arXiv:2310.11454},
|
||||
year={2023}
|
||||
}
|
||||
% BOFT
|
||||
@article{liu2023parameter,
|
||||
title={Parameter-efficient orthogonal finetuning via butterfly factorization},
|
||||
author={Liu, Weiyang and Qiu, Zeju and Feng, Yao and Xiu, Yuliang and Xue, Yuxuan and Yu, Longhui and Feng, Haiwen and Liu, Zhen and Heo, Juyeon and Peng, Songyou and others},
|
||||
journal={arXiv preprint arXiv:2311.06243},
|
||||
year={2023}
|
||||
}
|
||||
% LN-Tuning
|
||||
@article{zhao2023tuning,
|
||||
title={Tuning layernorm in attention: Towards efficient multi-modal llm finetuning},
|
||||
author={Zhao, Bingchen and Tu, Haoqin and Wei, Chen and Mei, Jieru and Xie, Cihang},
|
||||
journal={arXiv preprint arXiv:2312.11420},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
% Deepspeed
|
||||
@inproceedings{rasley2020deepspeed,
|
||||
title={Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters},
|
||||
author={Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
|
||||
booktitle={Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery \& data mining},
|
||||
pages={3505--3506},
|
||||
year={2020}
|
||||
}
|
||||
% Huggingface Transformers
|
||||
@inproceedings{wolf2020transformers,
|
||||
title={Transformers: State-of-the-art natural language processing},
|
||||
author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and others},
|
||||
booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations},
|
||||
pages={38--45},
|
||||
year={2020}
|
||||
}
|
||||
|
||||
@inproceedings{geva2021transformer,
|
||||
title={Transformer Feed-Forward Layers Are Key-Value Memories},
|
||||
author={Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer},
|
||||
booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
|
||||
pages={5484--5495},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
@article{su2024roformer,
|
||||
title={Roformer: Enhanced transformer with rotary position embedding},
|
||||
author={Su, Jianlin and Ahmed, Murtadha and Lu, Yu and Pan, Shengfeng and Bo, Wen and Liu, Yunfeng},
|
||||
journal={Neurocomputing},
|
||||
volume={568},
|
||||
pages={127063},
|
||||
year={2024},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
@article{barbero2024round,
|
||||
title={Round and round we go! what makes rotary positional encodings useful?},
|
||||
author={Barbero, Federico and Vitvitskyi, Alex and Perivolaropoulos, Christos and Pascanu, Razvan and Veli{\v{c}}kovi{\'c}, Petar},
|
||||
journal={arXiv preprint arXiv:2410.06205},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
@article{jin2025massive,
|
||||
title={Massive Values in Self-Attention Modules are the Key to Contextual Knowledge Understanding},
|
||||
author={Jin, Mingyu and Mei, Kai and Xu, Wujiang and Sun, Mingjie and Tang, Ruixiang and Du, Mengnan and Liu, Zirui and Zhang, Yongfeng},
|
||||
journal={arXiv preprint arXiv:2502.01563},
|
||||
year={2025}
|
||||
}
|
||||
@article{vaswani2017attention,
|
||||
title={Attention is all you need},
|
||||
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={30},
|
||||
year={2017}
|
||||
}
|
||||
@article{touvron2023llama,
|
||||
title={Llama: Open and efficient foundation language models},
|
||||
author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
|
||||
journal={arXiv preprint arXiv:2302.13971},
|
||||
year={2023}
|
||||
}
|
||||
@article{shazeer2020glu,
|
||||
title={Glu variants improve transformer},
|
||||
author={Shazeer, Noam},
|
||||
journal={arXiv preprint arXiv:2002.05202},
|
||||
year={2020}
|
||||
}
|
||||
@inproceedings{he2016deep,
|
||||
title={Deep residual learning for image recognition},
|
||||
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
|
||||
pages={770--778},
|
||||
year={2016}
|
||||
}
|
||||
@article{bai2023qwen,
|
||||
title={Qwen technical report},
|
||||
author={Bai, Jinze and Bai, Shuai and Chu, Yunfei and Cui, Zeyu and Dang, Kai and Deng, Xiaodong and Fan, Yang and Ge, Wenbin and Han, Yu and Huang, Fei and others},
|
||||
journal={arXiv preprint arXiv:2309.16609},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
% SiLU
|
||||
@article{elfwing2018sigmoid,
|
||||
title={Sigmoid-weighted linear units for neural network function approximation in reinforcement learning},
|
||||
author={Elfwing, Stefan and Uchibe, Eiji and Doya, Kenji},
|
||||
journal={Neural networks},
|
||||
volume={107},
|
||||
pages={3--11},
|
||||
year={2018},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
@article{ainslie2023gqa,
|
||||
title={Gqa: Training generalized multi-query transformer models from multi-head checkpoints},
|
||||
author={Ainslie, Joshua and Lee-Thorp, James and De Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
|
||||
journal={arXiv preprint arXiv:2305.13245},
|
||||
year={2023}
|
||||
}
|
||||
@article{voita2019bottom,
|
||||
title={The bottom-up evolution of representations in the transformer: A study with machine translation and language modeling objectives},
|
||||
author={Voita, Elena and Sennrich, Rico and Titov, Ivan},
|
||||
journal={arXiv preprint arXiv:1909.01380},
|
||||
year={2019}
|
||||
}
|
||||
@article{hu2023llm,
|
||||
title={Llm-adapters: An adapter family for parameter-efficient fine-tuning of large language models},
|
||||
author={Hu, Zhiqiang and Wang, Lei and Lan, Yihuai and Xu, Wanyu and Lim, Ee-Peng and Bing, Lidong and Xu, Xing and Poria, Soujanya and Lee, Roy Ka-Wei},
|
||||
journal={arXiv preprint arXiv:2304.01933},
|
||||
year={2023}
|
||||
}
|
||||
@article{team2024gemma,
|
||||
title={Gemma 2: Improving open language models at a practical size},
|
||||
author={Team, Gemma and Riviere, Morgane and Pathak, Shreya and Sessa, Pier Giuseppe and Hardin, Cassidy and Bhupatiraju, Surya and Hussenot, L{\'e}onard and Mesnard, Thomas and Shahriari, Bobak and Ram{\'e}, Alexandre and others},
|
||||
journal={arXiv preprint arXiv:2408.00118},
|
||||
year={2024}
|
||||
}
|
||||
@article{dubey2024llama,
|
||||
title={The llama 3 herd of models},
|
||||
author={Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
|
||||
journal={arXiv e-prints},
|
||||
pages={arXiv--2407},
|
||||
year={2024}
|
||||
}
|
||||
@article{team2024qwen2,
|
||||
title={Qwen2 technical report},
|
||||
author={Team, Qwen},
|
||||
journal={arXiv preprint arXiv:2407.10671},
|
||||
year={2024}
|
||||
}
|
||||
% Old
|
||||
|
||||
@article{sun2025stronger,
|
||||
title={A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
|
||||
author={Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2502.15828},
|
||||
year={2025}
|
||||
}
|
||||
@article{pfeiffer2020mad,
|
||||
title={Mad-x: An adapter-based framework for multi-task cross-lingual transfer},
|
||||
author={Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
|
||||
journal={arXiv preprint arXiv:2005.00052},
|
||||
year={2020}
|
||||
}
|
||||
@article{raffel2020exploring,
|
||||
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
|
||||
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
|
||||
journal={Journal of machine learning research},
|
||||
volume={21},
|
||||
number={140},
|
||||
pages={1--67},
|
||||
year={2020}
|
||||
}
|
||||
@article{zaken2021bitfit,
|
||||
title={Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
|
||||
author={Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
|
||||
journal={arXiv preprint arXiv:2106.10199},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{papineni2002bleu,
|
||||
title={Bleu: a method for automatic evaluation of machine translation},
|
||||
author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
|
||||
booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
|
||||
pages={311--318},
|
||||
year={2002}
|
||||
}
|
||||
@inproceedings{lin2004rouge,
|
||||
title={Rouge: A package for automatic evaluation of summaries},
|
||||
author={Lin, Chin-Yew},
|
||||
booktitle={Text summarization branches out},
|
||||
pages={74--81},
|
||||
year={2004}
|
||||
}
|
||||
@article{jang2016categorical,
|
||||
title={Categorical reparameterization with gumbel-softmax},
|
||||
author={Jang, Eric and Gu, Shixiang and Poole, Ben},
|
||||
journal={arXiv preprint arXiv:1611.01144},
|
||||
year={2016}
|
||||
}
|
||||
@inproceedings{he2015delving,
|
||||
title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},
|
||||
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||
booktitle={Proceedings of the IEEE international conference on computer vision},
|
||||
pages={1026--1034},
|
||||
year={2015}
|
||||
}
|
||||
@article{guo2025nlora,
|
||||
title={NLoRA: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
|
||||
author={Guo, Chenlu and Wu, Yuan and Chang, Yi},
|
||||
journal={arXiv preprint arXiv:2502.14482},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
@article{ba2016layer,
|
||||
title={Layer normalization},
|
||||
author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
|
||||
journal={arXiv preprint arXiv:1607.06450},
|
||||
year={2016}
|
||||
}
|
||||
|
||||
@article{team2023gemini,
|
||||
title={Gemini: a family of highly capable multimodal models},
|
||||
author={Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
|
||||
journal={arXiv preprint arXiv:2312.11805},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2023moelora,
|
||||
title={Moelora: An moe-based parameter efficient fine-tuning method for multi-task medical applications},
|
||||
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
|
||||
journal={arXiv preprint arXiv:2310.18339},
|
||||
year={2023}
|
||||
}
|
||||
@article{wang2023multilora,
|
||||
title={Multilora: Democratizing lora for better multi-task learning},
|
||||
author={Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
|
||||
journal={arXiv preprint arXiv:2311.11501},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2021p,
|
||||
title={P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
|
||||
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2110.07602},
|
||||
year={2021}
|
||||
}
|
||||
@article{brown2020language,
|
||||
title={Language models are few-shot learners},
|
||||
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={33},
|
||||
pages={1877--1901},
|
||||
year={2020}
|
||||
}
|
||||
@article{liu2021conflict,
|
||||
title={Conflict-averse gradient descent for multi-task learning},
|
||||
author={Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={18878--18890},
|
||||
year={2021}
|
||||
}
|
||||
@article{navon2022multi,
|
||||
title={Multi-task learning as a bargaining game},
|
||||
author={Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
|
||||
journal={arXiv preprint arXiv:2202.01017},
|
||||
year={2022}
|
||||
}
|
||||
@article{yu2020gradient,
|
||||
title={Gradient surgery for multi-task learning},
|
||||
author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={33},
|
||||
pages={5824--5836},
|
||||
year={2020}
|
||||
}
|
||||
@article{renduchintala2023tied,
|
||||
title={Tied-lora: Enhacing parameter efficiency of lora with weight tying},
|
||||
author={Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
|
||||
journal={arXiv preprint arXiv:2311.09578},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{kwon2023efficient,
|
||||
title={Efficient memory management for large language model serving with pagedattention},
|
||||
author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
|
||||
booktitle={Proceedings of the 29th Symposium on Operating Systems Principles},
|
||||
pages={611--626},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{dai2024deepseekmoe,
|
||||
title={Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models},
|
||||
author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
|
||||
journal={arXiv preprint arXiv:2401.06066},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2025deepseek,
|
||||
title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
|
||||
author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
|
||||
journal={arXiv preprint arXiv:2501.12948},
|
||||
year={2025}
|
||||
}
|
||||
@article{shazeer2017outrageously,
|
||||
title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
|
||||
author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
|
||||
journal={arXiv preprint arXiv:1701.06538},
|
||||
year={2017}
|
||||
}
|
||||
@inproceedings{rajbhandari2022deepspeed,
|
||||
title={Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale},
|
||||
author={Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={18332--18346},
|
||||
year={2022},
|
||||
organization={PMLR}
|
||||
}
|
||||
@article{zhang2023instruction,
|
||||
title={Instruction tuning for large language models: A survey},
|
||||
author={Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
|
||||
journal={arXiv preprint arXiv:2308.10792},
|
||||
year={2023}
|
||||
}
|
||||
@article{pfeiffer2020adapterfusion,
|
||||
title={Adapterfusion: Non-destructive task composition for transfer learning},
|
||||
author={Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
|
||||
journal={arXiv preprint arXiv:2005.00247},
|
||||
year={2020}
|
||||
}
|
||||
@article{pfeiffer2020adapterhub,
|
||||
title={Adapterhub: A framework for adapting transformers},
|
||||
author={Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
|
||||
journal={arXiv preprint arXiv:2007.07779},
|
||||
year={2020}
|
||||
}
|
||||
|
||||
@article{lu2023uniadapter,
|
||||
title={Uniadapter: Unified parameter-efficient transfer learning for cross-modal modeling},
|
||||
author={Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
|
||||
journal={arXiv preprint arXiv:2302.06605},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{fedus2022switch,
|
||||
title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
|
||||
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
|
||||
journal={Journal of Machine Learning Research},
|
||||
volume={23},
|
||||
number={120},
|
||||
pages={1--39},
|
||||
year={2022}
|
||||
}
|
||||
@article{lepikhin2020gshard,
|
||||
title={Gshard: Scaling giant models with conditional computation and automatic sharding},
|
||||
author={Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
|
||||
journal={arXiv preprint arXiv:2006.16668},
|
||||
year={2020}
|
||||
}
|
||||
@article{luo2024moelora,
|
||||
title={Moelora: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
|
||||
author={Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
|
||||
journal={arXiv preprint arXiv:2402.12851},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2024large,
|
||||
title={Large language model based multi-agents: A survey of progress and challenges},
|
||||
author={Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
|
||||
journal={arXiv preprint arXiv:2402.01680},
|
||||
year={2024}
|
||||
}
|
||||
@article{zhao2023survey,
|
||||
title={A survey of large language models},
|
||||
author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
|
||||
journal={arXiv preprint arXiv:2303.18223},
|
||||
year={2023}
|
||||
}
|
||||
@article{gao2024higher,
|
||||
title={Higher layers need more lora experts},
|
||||
author={Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
|
||||
journal={arXiv preprint arXiv:2402.08562},
|
||||
year={2024}
|
||||
}
|
||||
@inproceedings{dou2024loramoe,
|
||||
title={LoRAMoE: Alleviating world knowledge forgetting in large language models via MoE-style plugin},
|
||||
author={Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
|
||||
booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
|
||||
pages={1932--1945},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@article{achiam2023gpt,
|
||||
title={Gpt-4 technical report},
|
||||
author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
|
||||
journal={arXiv preprint arXiv:2303.08774},
|
||||
year={2023}
|
||||
}
|
||||
@article{jaszczur2021sparse,
|
||||
title={Sparse is enough in scaling transformers},
|
||||
author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={9895--9907},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{standley2020tasks,
|
||||
title={Which tasks should be learned together in multi-task learning?},
|
||||
author={Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={9120--9132},
|
||||
year={2020},
|
||||
organization={PMLR}
|
||||
}
|
||||
@article{cai2024survey,
|
||||
title={A survey on mixture of experts},
|
||||
author={Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
|
||||
journal={arXiv preprint arXiv:2407.06204},
|
||||
year={2024}
|
||||
}
|
||||
@article{karimi2021compacter,
|
||||
title={Compacter: Efficient low-rank hypercomplex adapter layers},
|
||||
author={Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={1022--1035},
|
||||
year={2021}
|
||||
}
|
||||
@article{bommasani2021opportunities,
|
||||
title={On the opportunities and risks of foundation models},
|
||||
author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
|
||||
journal={arXiv preprint arXiv:2108.07258},
year={2021}
}
779
mypaper/IJCAI2026_CASCADE.tex
Normal file
@@ -0,0 +1,779 @@
|
||||
\title{Coarse-to-Fine Spectral Cascading for Parameter-Efficient LLM Adaptation}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
\begin{abstract}
|
||||
Parameter-efficient fine-tuning is widely used for adapting large language models, and recent methods have explored frequency-domain parameterizations as a promising alternative to low-rank update assumptions.
However, most existing methods rely on a single structural assumption and treat different frequency components independently, limiting their ability to jointly model global adaptations, localized refinements, and the dependencies between them.
|
||||
To address these limitations, we propose CASCADE, a coarse-to-fine spectral cascading framework for parameter-efficient LLM adaptation. Specifically, CASCADE models weight updates using heterogeneous experts across frequency and spatial domains, and explicitly coordinates global and local updates through cascaded spectral modulation and adaptive routing.
|
||||
This design enables coherent integration of global structural adjustments with localized refinements, resulting in more effective and robust adaptation.
|
||||
Extensive experiments across multiple benchmarks demonstrate that CASCADE consistently outperforms strong PEFT baselines.
|
||||
\end{abstract}
|
||||
|
||||
\section{Introduction}
Large Language Models (LLMs) have demonstrated remarkable capabilities across a wide range of natural language understanding, reasoning, and generation tasks, and have become a fundamental component in various real-world applications.
|
||||
Despite their strong generalization ability, adapting pretrained LLMs to specific downstream tasks typically still requires task-specific fine-tuning.
|
||||
However, full-parameter fine-tuning incurs substantial computational and storage costs, which limits its practicality, particularly when a single model must be adapted to multiple domains.
|
||||
|
||||
To address this challenge, parameter-efficient fine-tuning (PEFT) methods have been extensively studied.
|
||||
Instead of updating all model parameters, PEFT methods restrict adaptation to a small set of trainable parameters while keeping the pretrained backbone frozen.
|
||||
Representative approaches such as Low-Rank Adaptation (LoRA)~\cite{hu2021lora} assume that weight updates lie in a low-dimensional subspace, achieving strong parameter efficiency.
|
||||
More recently, frequency-domain PEFT methods have been proposed, which parameterize weight updates in transformed domains such as Fourier or wavelet bases, exploiting the spectral structure of adaptation patterns~\cite{gao2024parameter,hu2025waveletft}.
|
||||
These methods have demonstrated promising efficiency-performance trade-offs across a variety of tasks.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{assets/influence_comparisonv3.pdf}
|
||||
\caption{Spectral characteristics of weight updates under full fine-tuning. High-frequency components dominate the spectral energy of weight updates, whereas low-frequency components, despite their low energy, affect a substantially larger portion of the weight matrix. This pattern is consistent across layers and modules, highlighting the distinct global and local roles of different frequency components.}
|
||||
\label{fig:spectral}
|
||||
\end{figure}
|
||||
|
||||
Despite this progress, existing PEFT methods still face two key limitations:
|
||||
|
||||
\paragraph{Challenge 1: Single-structure limitation.}
|
||||
Most PEFT approaches rely on a single structural constraint, such as restricting weight updates to a low-rank subspace or parameterizing them with a fixed frequency-domain basis.
|
||||
However, a single structure is insufficient to capture the heterogeneous nature of weight updates in LLM adaptation, which typically involves both global semantic or reasoning behavior and localized, fine-grained corrections.
|
||||
\paragraph{Challenge 2: Cross-frequency dependency.}
|
||||
Recent frequency-aware PEFT methods incorporate frequency or scale information into weight updates, but typically treat different frequency components as independent.
|
||||
In practice, effective adaptation requires alignment between local high-frequency refinements and global low-frequency updates. Ignoring such dependencies can lead to suboptimal or inefficient adaptations.
|
||||
|
||||
|
||||
Fig.~\ref{fig:spectral} reveals a notable mismatch between spectral energy and spatial influence in weight updates.
|
||||
High-frequency components dominate the spectral energy, yet their impact is often confined to a limited subset of parameters.
|
||||
In contrast, low-frequency components contribute relatively small spectral energy but influence a substantially larger portion of the weight matrix.
|
||||
The large spatial coverage with low spectral energy corresponds to smooth and coherent changes distributed across many parameters, characteristic of global structural adaptation.
|
||||
Conversely, high spectral energy concentrated on a limited subset of parameters corresponds to sparse and localized modifications.
|
||||
This contrast reveals that low-frequency components establish a global adaptation structure, while high-frequency components refine specific regions on top of this structure, forming a coarse-to-fine adaptation pattern.
|
||||
These observations suggest that effective adaptation requires modeling heterogeneous frequency components with distinct roles and capturing the dependency between global and local updates.
|
||||
|
||||
Motivated by this insight, we propose \textbf{CASCADE} (Coarse-to-Fine Spectral Cascading), a parameter-efficient fine-tuning framework that explicitly models heterogeneous frequency components of weight updates and their dependencies.
|
||||
CASCADE adopts a heterogeneous mixture-of-experts architecture, in which complementary experts are designed to capture different structural roles of weight updates.
|
||||
Specifically, we employ \textit{(i) a low-frequency expert} based on the Discrete Cosine Transform (DCT) to capture global and smooth structural adjustments across the weight matrix; \textit{(ii) a high-frequency expert} operating on wavelet detail subbands to model localized refinements corresponding to fine-grained corrections; and \textit{(iii) a spatial residual expert} in the original parameter space to handle update patterns that are difficult to represent in the frequency domain.
|
||||
|
||||
Crucially, CASCADE goes beyond treating these components as independent. We introduce a cascaded spectral modulation mechanism that establishes an explicit coarse-to-fine dependency, in which low-frequency updates provide a global adaptation structure that conditions the generation of high-frequency refinements.
|
||||
This design enforces alignment between global and local updates, ensuring that localized corrections remain consistent with the overall adaptation direction.
|
||||
In addition, we incorporate a spectral complexity-aware routing mechanism that dynamically weights different experts based on input characteristics, enabling flexible and context-sensitive adaptation.
|
||||
|
||||
Together, these designs enable CASCADE to overcome the limitations of single-structure modeling and independent frequency components in existing PEFT methods, providing a unified framework that coherently captures both global and local updates. Our contributions are summarized as follows:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item To our knowledge, CASCADE is the first PEFT framework that models LLM weight updates using heterogeneous experts across frequency and spatial domains, enabling a unified representation of global and localized refinements.
|
||||
|
||||
\item We introduce a cascaded spectral modulation mechanism that establishes coarse-to-fine dependencies between low- and high-frequency updates, together with a spectral complexity-aware routing strategy for adaptive expert combination.
|
||||
|
||||
\item Extensive experiments on fifteen public benchmarks, using three backbone models and covering commonsense and arithmetic tasks, demonstrate that CASCADE consistently outperforms existing mainstream PEFT methods, validating its effectiveness across diverse adaptation scenarios.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\begin{figure*}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{assets/model2.pdf}
|
||||
\caption{Overview of CASCADE.
|
||||
CASCADE adapts frozen backbone modules via heterogeneous frequency-domain and spatial-domain experts, coordinated by cascaded modulation and dynamically combined through spectral complexity-aware routing.
|
||||
}
|
||||
\label{fig:framework}
|
||||
\end{figure*}
|
||||
|
||||
\section{Preliminaries}
|
||||
\label{sec:pre}
|
||||
|
||||
In this section, we briefly introduce the key formulations and perspectives that will be used throughout the paper, in order to facilitate the presentation of our method in Section~3.
|
||||
|
||||
\subsection{Parameter-Efficient Fine-Tuning}
|
||||
Parameter-efficient fine-tuning (PEFT) aims to adapt a pretrained model to downstream tasks by learning a small number of trainable parameters, while keeping the original pretrained weights frozen.
|
||||
|
||||
Let $\mathbf{W}_0 \in \mathbb{R}^{m \times n}$ denote a pretrained weight matrix of a linear transformation, and $\mathbf{x} \in \mathbb{R}^n$ be the corresponding input.
|
||||
Instead of directly updating $\mathbf{W}_0$, PEFT methods introduce an additive weight update $\Delta \mathbf{W}$, yielding the adapted transformation:
|
||||
\begin{equation}
|
||||
\mathbf{y} = (\mathbf{W}_0 + \Delta \mathbf{W}) \mathbf{x}.
|
||||
\label{eq:peft}
|
||||
\end{equation}
|
||||
|
||||
The core principle of PEFT is to impose structural constraints on $\Delta \mathbf{W}$ to significantly reduce the adaptation cost.
|
||||
Common constraints include low-rank factorization and sparsity assumptions, while more recent approaches have also explored structured parameterizations in transformed domains.
|
||||
Under this formulation, different PEFT methods can be viewed as imposing distinct structural assumptions on $\Delta \mathbf{W}$, which, in turn, determine the types of update patterns they are capable of representing.
|
||||
In practice, most existing PEFT methods adopt a single structural assumption throughout adaptation.
|
||||
This observation motivates exploring complementary structural assumptions that can capture heterogeneous update patterns from different perspectives.
|
||||
|
||||
\subsection{Frequency-Domain View of Weight Updates}
|
||||
|
||||
Under the formulation in Eq.~\eqref{eq:peft}, the weight update $\Delta \mathbf{W} \in \mathbb{R}^{m \times n}$ can be interpreted as a two-dimensional signal defined over the parameter indices.
|
||||
From this perspective, it is natural to analyze $\Delta \mathbf{W}$ in the frequency domain by applying an appropriate linear transform, which decomposes the update into components associated with different spatial frequencies.
|
||||
|
||||
In general, low-frequency components correspond to smooth, slowly varying patterns spanning large regions of the weight matrix, while high-frequency components capture rapid variations localized to specific parameter regions.
|
||||
These components reflect distinct structural characteristics of weight updates, ranging from global, coherent adjustments to localized, fine-grained modifications.
|
||||
Frequency-domain analysis thus provides a complementary view to spatial-domain parameterizations, emphasizing the scale and distribution of variations rather than explicit locations.
|
||||
|
||||
Different frequency-domain transforms, such as Fourier or wavelet transforms, offer alternative bases for representing $\Delta \mathbf{W}$, each inducing different biases with respect to globality, locality, and multi-scale structure.
|
||||
Representing weight updates in frequency-domain bases allows different structural constraints to be applied to components at different frequencies, enabling finer control over the resulting update patterns.
|
||||
|
||||
Importantly, this frequency-domain perspective does not assume a specific decomposition strategy but provides a unified view for characterizing heterogeneous structures in weight updates.
|
||||
However, existing PEFT methods typically adopt a single structural assumption for adaptation, limiting their ability to jointly capture global and localized updates.
|
||||
|
||||
\section{Method}
|
||||
|
||||
\subsection{Overview}
|
||||
|
||||
Existing PEFT methods typically rely on a single structural assumption and treat different components of weight updates as independent, which limits their ability to model heterogeneous and interdependent adaptation behaviors in LLMs.
|
||||
In practice, effective adaptation often involves both global structural adjustments and localized refinements, which are difficult to capture under independent modeling assumptions.
|
||||
From a spectral perspective, weight updates can be decomposed into components with distinct functional roles.
|
||||
Low-frequency components generally capture smooth, global adjustments that shape the overall adaptation structure, whereas high-frequency components correspond to localized and fine-grained modifications.
|
||||
In addition, these components are inherently coupled, forming a coarse-to-fine adaptation pattern in which local refinements are guided by a global structure.
|
||||
|
||||
|
||||
Motivated by this observation, we propose \textbf{CASCADE} (Coarse-to-Fine Spectral Cascading), a PEFT framework that explicitly models heterogeneous update components together with their coarse-to-fine dependencies.
|
||||
As illustrated in Fig.~\ref{fig:framework}, CASCADE adopts a heterogeneous mixture-of-experts design with a frozen backbone.
|
||||
It introduces three complementary experts:
|
||||
(i) a low-frequency expert operating in the Discrete Cosine Transform (DCT) domain to capture global and smooth updates,
|
||||
(ii) a high-frequency expert that models wavelet detail components to represent localized refinements, and
|
||||
(iii) a spatial residual expert in the original parameter space to handle update patterns that are not well represented in the frequency domain.
|
||||
|
||||
CASCADE further incorporates a cascaded spectral modulation mechanism, in which low-frequency updates condition high-frequency refinements to enforce consistency between global and local adaptations.
|
||||
In addition, a spectral complexity-aware routing module dynamically combines the outputs of different experts based on input characteristics.
|
||||
|
||||
By jointly modeling heterogeneous updates and dependencies, CASCADE provides a unified framework for efficiently capturing both global and local adaptation refinements.
|
||||
|
||||
\subsection{Problem Formulation}
|
||||
|
||||
Under the standard PEFT setting introduced in Section~\ref{sec:pre}, CASCADE adapts a frozen pretrained weight matrix $\mathbf{W}_0$ by learning a structured weight update.
|
||||
Specifically, CASCADE represents the update as a combination of $E$ complementary experts, each producing a structured update $\Delta \mathbf{W}_e$ that captures a distinct type of adaptation pattern.
|
||||
Given an input $\mathbf{x}$, the adapted output is obtained by aggregating expert-specific updates using input-dependent routing weights:
|
||||
\begin{equation}
|
||||
\mathbf{y} = \mathbf{W}_0 \mathbf{x} + \sum_{e=1}^{E} w_e(\mathbf{x}) \cdot \Delta \mathbf{W}_e \mathbf{x},
|
||||
\label{eq:cascade_formulation}
|
||||
\end{equation}
|
||||
where $w_e(\mathbf{x})$ denotes the routing weight assigned to the $e$-th expert.
|
||||
The specific parameterizations of $\Delta \mathbf{W}_e$, the mechanisms for modeling inter-expert dependencies, and the routing strategy are described in the following subsections.
|
||||
|
||||
\subsection{Heterogeneous Domain Experts}
|
||||
As discussed, weight updates in LLM adaptation exhibit heterogeneous structural characteristics.
|
||||
To capture this heterogeneity, CASCADE introduces domain-specific experts that impose distinct inductive biases through different parameterizations of $\Delta \mathbf{W}_e$.
|
||||
This design enables complementary modeling of diverse update patterns, alleviating the limitations imposed by a single structural assumption.
|
||||
|
||||
|
||||
\subsubsection{Low-Frequency Expert via Discrete Cosine Transform}
|
||||
The low-frequency expert is designed to capture global and smooth update patterns that span large regions of the weight matrix.
|
||||
Such patterns commonly arise from semantic alignment or global reasoning adjustments and are inefficient to represent using localized or sparse parameterizations.
|
||||
|
||||
To introduce a global smoothness prior, we parameterize the update in the Discrete Cosine Transform (DCT) domain.
|
||||
Let $\mathbf{S}_{\text{dct}} \in \mathbb{R}^{m \times n}$ denote a DCT-domain coefficient matrix, with the same dimensions as the corresponding weight matrix.
|
||||
We restrict learning to a predefined low-frequency index set $\mathcal{I}_{\text{dct}}$ and define the sparse spectral parameterization as
|
||||
\begin{equation}
|
||||
\mathbf{S}_{\text{dct}}[i,j] =
|
||||
\begin{cases}
|
||||
s_k, & (i,j) \in \mathcal{I}_{\text{dct}}, \\
|
||||
0, & \text{otherwise},
|
||||
\end{cases}
|
||||
\label{eq:dct_sparse}
|
||||
\end{equation}
|
||||
where $\{s_k\}_{k=1}^{K_{\text{dct}}}$ are trainable parameters associated with fixed low-frequency locations $(i_k,j_k)\in\mathcal{I}_{\text{dct}}$.
|
||||
The index set $\mathcal{I}_{\text{dct}}$ is obtained by selecting the $K_{\text{dct}}$ locations with the smallest Manhattan distance to the zero-frequency index (0, 0), thereby favoring slowly varying spatial patterns.
|
||||
|
||||
|
||||
The corresponding spatial-domain update produced by the low-frequency expert is obtained via the inverse DCT:
|
||||
\begin{equation}
|
||||
\Delta \mathbf{W}_{\text{dct}} = \mathrm{IDCT}(\mathbf{S}_{\text{dct}}).
|
||||
\label{eq:dct_inverse}
|
||||
\end{equation}
|
||||
|
||||
By restricting learning to low-frequency coefficients, this expert enforces a global smoothness prior on $\Delta \mathbf{W}_{\text{dct}}$, enabling efficient modeling of large-scale structural adjustments with a compact parameterization.
|
||||
As such, it serves as the global backbone of the adaptation.
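For concreteness, a minimal NumPy/SciPy sketch of the construction in Eq.~\eqref{eq:dct_sparse} and the reconstruction in Eq.~\eqref{eq:dct_inverse} is shown below; the orthonormal DCT normalization and all function and variable names are illustrative assumptions rather than the exact implementation.
\begin{verbatim}
import numpy as np
from scipy.fft import idctn

def lowfreq_dct_update(m, n, coeffs):
    # Low-frequency index set: the len(coeffs) locations closest to (0, 0)
    # in Manhattan distance, favoring slowly varying spatial patterns.
    idx = sorted(((i + j, i, j) for i in range(m) for j in range(n)))[:len(coeffs)]
    S = np.zeros((m, n))
    for s_k, (_, i, j) in zip(coeffs, idx):
        S[i, j] = s_k                    # trainable low-frequency coefficients
    return idctn(S, norm="ortho")        # spatial-domain update Delta W_dct
\end{verbatim}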
|
||||
|
||||
\subsubsection{High-Frequency Expert via Wavelet Details}
|
||||
|
||||
|
||||
While the low-frequency expert captures global structure, effective adaptation also requires localized, fine-grained high-frequency corrections that global frequency components fail to capture effectively.
|
||||
To model such patterns, the high-frequency expert parameterizes updates in the wavelet domain, which provides localization in both spatial and frequency domains.
|
||||
|
||||
We adopt a single-level two-dimensional Haar wavelet basis, which defines four wavelet subbands: one low-frequency approximation subband ($\mathbf{LL}$) and three detail subbands ($\mathbf{LH}$, $\mathbf{HL}$, and $\mathbf{HH}$) corresponding to high-frequency components along different spatial directions.
|
||||
To focus on localized refinements, we discard the approximation component and parameterize only the detail subbands.
|
||||
|
||||
Let $\mathcal{B}=\{\mathrm{LH},\mathrm{HL},\mathrm{HH}\}$ denote the set of detail subbands.
|
||||
For each $b\in\mathcal{B}$, we learn a sparse coefficient matrix $\mathbf{B}_b$ defined on a fixed index set $\mathcal{I}_b$, which is randomly sampled once and kept constant during training:
|
||||
\begin{equation}
|
||||
\mathbf{B}_b[i,j] =
|
||||
\begin{cases}
|
||||
s^{(b)}_k, & (i,j)\in\mathcal{I}_b, \\
|
||||
0, & \text{otherwise}.
|
||||
\end{cases}
|
||||
\label{eq:wavelet_sparse}
|
||||
\end{equation}
|
||||
|
||||
The spatial-domain update is reconstructed via the inverse Haar transform from the detail subband coefficients:
|
||||
\begin{equation}
|
||||
\Delta \mathbf{W}_{\text{wav}} =
|
||||
\mathrm{IHaar}\!\left(
|
||||
\mathbf{0},\,
|
||||
\mathbf{B}_{\mathrm{LH}},\,
|
||||
\mathbf{B}_{\mathrm{HL}},\,
|
||||
\mathbf{B}_{\mathrm{HH}}
|
||||
\right).
|
||||
\label{eq:wavelet_inverse}
|
||||
\end{equation}
|
||||
|
||||
By restricting learning to sparse detail coefficients, the wavelet expert provides a dedicated mechanism for fine-grained corrections that complements the global updates modeled in the DCT expert, naturally motivating explicit coarse-to-fine coordination across frequency components.
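A minimal sketch of this reconstruction using PyWavelets is given below, assuming even matrix dimensions and mapping the detail subbands onto the library's (horizontal, vertical, diagonal) ordering; the names are illustrative rather than taken from the actual implementation.
\begin{verbatim}
import numpy as np
import pywt

def highfreq_wavelet_update(m, n, detail_coeffs, index_sets):
    # detail_coeffs / index_sets are dicts keyed by "LH", "HL", "HH".
    half = (m // 2, n // 2)              # single-level Haar subband size
    LL = np.zeros(half)                  # approximation subband fixed to zero
    bands = []
    for b in ("LH", "HL", "HH"):
        B = np.zeros(half)
        for s, (i, j) in zip(detail_coeffs[b], index_sets[b]):
            B[i, j] = s                  # sparse trainable detail coefficients
        bands.append(B)
    return pywt.idwt2((LL, tuple(bands)), wavelet="haar")
\end{verbatim}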
|
||||
|
||||
\subsubsection{Spatial Residual Expert}
|
||||
|
||||
Although frequency-domain parameterizations impose useful structural priors, they may fail to capture certain irregular update patterns that are not well represented by predefined spectral bases.
|
||||
To account for such out-of-basis effects, CASCADE includes a lightweight spatial residual expert that directly operates in the original parameter space by parameterizing a residual update using a low-rank factorization:
|
||||
\begin{equation}
|
||||
\Delta \mathbf{W}_{\text{spatial}} = \mathbf{B}\mathbf{A},
|
||||
\label{eq:spatial_update}
|
||||
\end{equation}
|
||||
where $\mathbf{A}\in\mathbb{R}^{r\times n}$ and $\mathbf{B}\in\mathbb{R}^{m\times r}$ with a small rank $r$.
|
||||
This formulation provides flexible capacity for modeling update patterns that are difficult to express in the frequency domain.
|
||||
|
||||
The spatial expert serves as a residual component for out-of-basis corrections, allowing frequency-domain experts to focus on structured global and local patterns while improving robustness and expressive completeness.
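As a reference point, a standard low-rank parameterization of this residual expert can be sketched as follows; the initialization scheme is an assumption of this sketch, and rank 48 matches the configuration reported later.
\begin{verbatim}
import torch
import torch.nn as nn

class SpatialResidualExpert(nn.Module):
    # Low-rank residual update Delta W = B A with a small rank r.
    def __init__(self, m, n, r=48):
        super().__init__()
        self.A = nn.Parameter(torch.randn(r, n) * 0.01)
        self.B = nn.Parameter(torch.zeros(m, r))  # zero init: initial update is zero

    def forward(self, x):                # x: (..., n)
        return x @ self.A.T @ self.B.T   # equivalent to applying (B A) to x
\end{verbatim}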
|
||||
|
||||
|
||||
\subsection{Cascaded Spectral Modulation}
|
||||
|
||||
The heterogeneous experts introduced above capture complementary aspects of weight updates.
|
||||
However, treating global and local updates as independent components ignores their inherent dependency, as localized refinements in practice are often guided by a global structure.
|
||||
To explicitly model this coarse-to-fine relationship, CASCADE introduces a cascaded spectral modulation mechanism that enforces consistency between low-frequency structure and high-frequency updates.
|
||||
Specifically, we construct a fixed-dimensional conditioning vector $\mathbf{z}$ by flattening the learned low-frequency DCT coefficients. This vector summarizes the global adaptation pattern and is used as the input to a conditioning network:
|
||||
|
||||
\begin{equation}
|
||||
(\gamma_b, \beta_b)_{b\in\mathcal{B}} = g(\mathbf{z}),
|
||||
\label{eq:film_params}
|
||||
\end{equation}
|
||||
where $g(\cdot)$ denotes a lightweight multilayer perceptron that outputs band-wise scalar modulation parameters, and $\mathcal{B}=\{\mathrm{LH},\mathrm{HL},\mathrm{HH}\}$ indexes the wavelet detail subbands, to which the modulation is applied:
|
||||
\begin{equation}
|
||||
\tilde{\mathbf{B}}_b = (1 + \gamma_b)\,\mathbf{B}_b + \beta_b,
|
||||
\quad b\in\mathcal{B},
|
||||
\label{eq:bandwise_film}
|
||||
\end{equation}
|
||||
where $\gamma_b$ and $\beta_b$ are scalar parameters shared across all locations within subband $b$. The modulation is applied only to the sampled coefficient locations in $\mathcal{I}_b$, with all other entries remaining zero, and the resulting coefficients are used to reconstruct the high-frequency update via Eq.~\eqref{eq:wavelet_inverse}.
|
||||
|
||||
This design establishes an explicit coarse-to-fine dependency, allowing global low-frequency structure to guide localized refinements and yield more coherent weight updates.
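The conditioning network $g(\cdot)$ can be sketched as a small MLP that maps the flattened DCT coefficients to per-subband scalars, as below; the hidden width and the explicit location masks are assumptions of this sketch, not the exact implementation.
\begin{verbatim}
import torch
import torch.nn as nn

class CascadedModulation(nn.Module):
    def __init__(self, k_dct, hidden=32, num_bands=3):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(k_dct, hidden), nn.GELU(),
                                 nn.Linear(hidden, 2 * num_bands))

    def forward(self, s_dct, detail_bands, masks):
        # Per-subband scalars (gamma_b, beta_b) conditioned on low-frequency structure.
        gamma, beta = self.net(s_dct.flatten()).chunk(2)
        # Modulation is applied only at the sampled coefficient locations.
        return [(1 + g) * B + b * M
                for g, b, B, M in zip(gamma, beta, detail_bands, masks)]
\end{verbatim}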
|
||||
|
||||
|
||||
\subsection{Spectral Complexity-Aware Routing}
|
||||
|
||||
While cascaded spectral modulation defines how different update components are coupled, the relative importance of these components can vary across inputs.
|
||||
Some inputs primarily require global structural adaptation, while others benefit more from localized or residual corrections.
|
||||
To account for this variability, CASCADE employs a spectral complexity-aware routing mechanism that dynamically combines expert outputs based on input characteristics.
|
||||
|
||||
Given the input activation to a linear layer, we obtain a sequence-level representation $\bar{\mathbf{x}}$ via pooling.
|
||||
From this representation, we extract two complementary types of routing features.
|
||||
First, lightweight spectral statistics are computed to characterize the degree of variation and oscillation in the input, forming a spectral feature vector $\bar{\mathbf{x}}_{\text{spec}}$.
|
||||
Second, a semantic feature is obtained through a learnable linear projection of $\bar{\mathbf{x}}$ to provide higher-level contextual information.
|
||||
The two complementary features are fused through linear projections:
|
||||
\begin{equation}
|
||||
\mathbf{h} = \mathbf{W}_{\text{spec}} \bar{\mathbf{x}}_{\text{spec}} + \mathbf{W}_{\text{sem}} \bar{\mathbf{x}},
|
||||
\label{eq:feature_fusion}
|
||||
\end{equation}
|
||||
and mapped to expert weights via a softmax:
|
||||
\begin{equation}
|
||||
\mathbf{w} = \mathrm{softmax}(\mathbf{W}_{\text{out}} \mathbf{h}),
|
||||
\label{eq:routing_weights}
|
||||
\end{equation}
|
||||
where $\mathbf{w}\in\mathbb{R}^{E}$ assigns a non-negative weight to each expert.
|
||||
|
||||
By leveraging coarse spectral cues and semantic context, the routing mechanism adaptively weights expert contributions while preserving efficient and stable soft combination.
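A schematic implementation of the routing module is given below; the particular spectral statistics (mean, standard deviation, and first-difference statistics of the pooled activation) are an assumption of this sketch rather than the exact features used.
\begin{verbatim}
import torch
import torch.nn as nn

class SpectralRouter(nn.Module):
    def __init__(self, d_model, d_spec=4, num_experts=3):
        super().__init__()
        self.w_spec = nn.Linear(d_spec, d_model, bias=False)
        self.w_sem  = nn.Linear(d_model, d_model, bias=False)
        self.w_out  = nn.Linear(d_model, num_experts)

    def spectral_stats(self, x_bar):
        d = x_bar[..., 1:] - x_bar[..., :-1]        # coarse oscillation cue
        return torch.stack([x_bar.mean(-1), x_bar.std(-1),
                            d.abs().mean(-1), d.std(-1)], dim=-1)

    def forward(self, x):                           # x: (batch, seq, d_model)
        x_bar = x.mean(dim=1)                       # sequence-level pooling
        h = self.w_spec(self.spectral_stats(x_bar)) + self.w_sem(x_bar)
        return torch.softmax(self.w_out(h), dim=-1) # expert weights w
\end{verbatim}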
|
||||
|
||||
\subsection{Training Details}
|
||||
|
||||
CASCADE is trained end-to-end under the standard supervised objective for the downstream task, while keeping the backbone frozen.
|
||||
The overall training objective consists of the task loss and two auxiliary regularization terms:
|
||||
\begin{equation}
|
||||
\mathcal{L}
|
||||
=
|
||||
\mathcal{L}_{\text{task}}
|
||||
+
|
||||
\lambda_{\text{bal}} \mathcal{L}_{\text{bal}}
|
||||
+
|
||||
\lambda_{\text{orth}} \mathcal{L}_{\text{orth}},
|
||||
\label{eq:training_objective}
|
||||
\end{equation}
|
||||
where $\lambda_{\text{bal}}$ and $\lambda_{\text{orth}}$ control the strength of the regularizers.
|
||||
|
||||
\paragraph{Routing Regularization.}
|
||||
To prevent degenerate routing solutions, we introduce a load-balancing regularization, which is defined as
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{bal}}
|
||||
=
|
||||
E \sum_{e=1}^{E}
|
||||
\left(
|
||||
\frac{1}{B} \sum_{b=1}^{B} w_e^{(b)}
|
||||
\right)^2,
|
||||
\label{eq:load_balance}
|
||||
\end{equation}
|
||||
where $w_e^{(b)}$ denotes the routing weight of expert $e$ for the $b$-th sample, and $E$ is the number of experts.
|
||||
|
||||
\paragraph{Spectral Orthogonality.}
|
||||
To reduce redundancy between frequency-domain experts, we impose an orthogonality regularization on their spectral parameters.
|
||||
Specifically, we penalize the inner product between the low-frequency spectral coefficients and high-frequency wavelet detail coefficients.
|
||||
Both spectral representations are first mapped to a common latent space with matched dimensionality.
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{orth}}
|
||||
=
|
||||
\left|
|
||||
\left\langle
|
||||
\mathrm{vec}(\mathbf{S}_{\text{dct}}),
|
||||
\mathrm{vec}([\mathbf{B}_{\mathrm{LH}}, \mathbf{B}_{\mathrm{HL}}, \mathbf{B}_{\mathrm{HH}}])
|
||||
\right\rangle
|
||||
\right|.
|
||||
\label{eq:orth_loss}
|
||||
\end{equation}
|
||||
This regularization encourages the two experts to capture complementary spectral patterns.
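Both regularizers reduce to a few lines; the sketch below assumes the two spectral representations have already been projected to a common dimensionality, as described above.
\begin{verbatim}
import torch

def load_balance_loss(w):          # w: (batch, E) routing weights
    mean_w = w.mean(dim=0)         # average load per expert
    return w.shape[1] * (mean_w ** 2).sum()

def orthogonality_loss(s_dct, detail_bands):
    lf = s_dct.flatten()
    hf = torch.cat([B.flatten() for B in detail_bands])
    return torch.abs(torch.dot(lf, hf))

# total = task_loss + 0.01 * load_balance_loss(w) + 0.01 * orthogonality_loss(...)
\end{verbatim}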
|
||||
|
||||
|
||||
Algorithm~\ref{alg:cascade} summarizes the overall procedure of CASCADE, including expert-specific update construction, cascaded spectral modulation, and expert routing.
|
||||
|
||||
\begin{algorithm}[t]
|
||||
\caption{CASCADE: Coarse-to-Fine Spectral Cascading}
|
||||
\label{alg:cascade}
|
||||
\KwIn{Input activation $\mathbf{x}$, frozen weight matrix $\mathbf{W}_0$}
|
||||
\KwOut{Adapted output $\mathbf{y}$}
|
||||
|
||||
Compute base output $\mathbf{y}_0 \leftarrow \mathbf{W}_0 \mathbf{x}$ \\
|
||||
|
||||
\textbf{Low-frequency expert:} \\
|
||||
Construct sparse DCT spectrum $\mathbf{S}_{\text{dct}}$ using Eq.~\eqref{eq:dct_sparse} \\
|
||||
Reconstruct global update $\Delta \mathbf{W}_{\text{dct}}$ using Eq.~\eqref{eq:dct_inverse} \\
|
||||
|
||||
\textbf{High-frequency expert:} \\
|
||||
Construct sparse wavelet detail coefficients $\{\mathbf{B}_b\}_{b\in\mathcal{B}}$ using Eq.~\eqref{eq:wavelet_sparse} \\
|
||||
Compute modulation parameters $(\gamma_b,\beta_b)_{b\in\mathcal{B}}$ using Eq.~\eqref{eq:film_params} \\
|
||||
Apply band-wise modulation $\tilde{\mathbf{B}}_b$ using Eq.~\eqref{eq:bandwise_film} \\
|
||||
Reconstruct local update $\Delta \mathbf{W}_{\text{wav}}$ using Eq.~\eqref{eq:wavelet_inverse} \\
|
||||
|
||||
\textbf{Spatial residual expert:} \\
|
||||
Compute residual update $\Delta \mathbf{W}_{\text{spatial}}$ using Eq.~\eqref{eq:spatial_update} \\
|
||||
|
||||
\textbf{Routing and aggregation:} \\
|
||||
Compute expert weights $\mathbf{w}$ using Eq.~\eqref{eq:routing_weights} \\
|
||||
Compute aggregated update $\Delta \mathbf{W} \leftarrow \sum_{e=1}^{E} w_e \cdot \Delta \mathbf{W}_e$ \\
|
||||
Return $\mathbf{y} \leftarrow \mathbf{y}_0 + \Delta \mathbf{W}\mathbf{x}$
|
||||
\end{algorithm}
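For orientation, the per-layer computation of Algorithm~\ref{alg:cascade} can be summarized by the schematic forward pass below, where each expert is treated as a callable returning its weight update; this is a high-level sketch rather than the actual implementation.
\begin{verbatim}
def cascade_forward(x, W0, experts, router):
    # x: (batch, n); W0: frozen (m, n) weight; experts return (m, n) updates.
    w = router(x)                          # (batch, E) soft routing weights
    y = x @ W0.T                           # frozen base output
    for e, expert in enumerate(experts):
        dW = expert()                      # expert-specific update Delta W_e
        y = y + w[:, e:e + 1] * (x @ dW.T)
    return y
\end{verbatim}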
\begin{table*}[t]
|
||||
\centering
|
||||
\small
|
||||
\caption{Comparison of CASCADE and baselines on Commonsense tasks across three backbones, reported in accuracy (\%), with micro-avg denoting the average performance.
|
||||
$\ ^{*}$ indicates statistically significant improvements over the best baseline (two-sided t-test, $p<0.05$).}
|
||||
\resizebox{1\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.05}
|
||||
\begin{tabular}{l|lccccccccc}
|
||||
\toprule
|
||||
\textbf{Backbone LLM} & \textbf{Method}
|
||||
& \textbf{BoolQ} & \textbf{PIQA} & \textbf{SIQA}
|
||||
& \textbf{ARC-C} & \textbf{ARC-E} & \textbf{OBQA}
|
||||
& \textbf{HellaSwag} & \textbf{WinoGrande}
|
||||
& \textbf{micro-avg(\%)$\uparrow$} \\
|
||||
\midrule
|
||||
\multirow{7}{*}{\textbf{Qwen 3 4B}}
|
||||
& LoRA &66.88&82.97&\underline{73.59}&86.86&92.21&\underline{83.60}&85.37&\underline{68.75}&81.27\\
|
||||
& AdaLoRA &\underline{67.34}&82.64&73.44&87.03&92.89&82.00&79.99&67.88&78.89\\
|
||||
& BONE &66.15&81.61&72.62&85.24&92.55&75.40&78.85&68.11&77.78\\
|
||||
& FourierFT &66.57&80.30&73.54&86.01&92.09&82.40&79.59&63.14&78.01\\
|
||||
& LoCA &66.85&83.03&72.67&86.95&\underline{93.27}&80.60&84.33&66.69&80.66\\
|
||||
& FlyLoRA &66.51&\underline{83.35}&73.54&\underline{87.20}&93.06&78.20&\underline{85.63}&68.35&\underline{81.33}\\
|
||||
& \framework (ours)
|
||||
&\textbf{67.74}&\textbf{83.46}&\textbf{75.49}&\textbf{87.88}
|
||||
&\textbf{93.64}&\textbf{86.40}&\textbf{85.75}&\textbf{71.98}
|
||||
&\textbf{82.22}$^{*}$\\
|
||||
\midrule
|
||||
\multirow{7}{*}{\textbf{LLaMA 3.2 3B}}
|
||||
& LoRA &61.41&78.62&66.79&68.26&84.05&70.20&79.49&\underline{56.35}&\underline{74.05}\\
|
||||
& AdaLoRA &\underline{61.53}&78.89&67.04&\underline{69.71}&83.63&69.60&79.31&54.78&73.96\\
|
||||
& BONE &60.61&76.17&66.53&67.24&79.88&63.20&79.28&50.04&72.61\\
|
||||
& FourierFT &60.92&\underline{80.30}&59.47&67.75&82.45&66.40&79.05&50.67&72.68\\
|
||||
& LoCA &61.07&78.51&64.12&66.47&82.37&67.20&77.07&55.88&72.31\\
|
||||
& FlyLoRA &59.02&78.94&\underline{67.14}&67.58&\underline{84.22}&\underline{71.80}&\underline{79.66}&52.49&73.64\\
|
||||
& \framework (ours)
|
||||
&\textbf{62.66}&\textbf{80.69}&\textbf{67.40}&\textbf{69.97}
|
||||
&\textbf{84.68}&\textbf{73.60}&\textbf{79.94}&\textbf{62.59}
|
||||
&\textbf{75.25}$^{*}$\\
|
||||
\midrule
|
||||
\multirow{7}{*}{\textbf{Gemma 3 4B}}
|
||||
& LoRA &64.34&78.07&\underline{70.21}&75.26&\underline{87.37}&75.60&\underline{77.97}&\underline{61.88}&\underline{75.21}\\
|
||||
& AdaLoRA &\underline{64.86}&\underline{79.16}&69.91&75.68&86.87&72.00&77.19&61.17&74.84\\
|
||||
& BONE &63.67&78.35&69.19&\underline{76.11}&86.95&70.60&73.97&48.22&72.37\\
|
||||
& FourierFT &64.22&77.42&68.68&74.32&87.33&72.00&74.49&50.75&72.68\\
|
||||
& LoCA &63.52&76.82&68.47&73.29&85.98&68.20&75.06&49.01&72.39\\
|
||||
& FlyLoRA &61.59&76.12&67.45&75.34&86.53&\underline{77.60}&77.88&58.72&74.15\\
|
||||
& \framework (ours)
|
||||
&\textbf{65.81}&\textbf{80.36}&\textbf{73.39}&\textbf{77.39}
|
||||
&\textbf{88.97}&\textbf{79.00}&\textbf{78.47}&\textbf{64.09}
|
||||
&\textbf{76.59}$^{*}$\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\label{tab:main_common}
|
||||
\vspace{-4px}
|
||||
\end{table*}
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\caption{Average Commonsense QA accuracy across Qwen-3 model scales, comparing CASCADE with best PEFT baselines.}
|
||||
\resizebox{1\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1}
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\textbf{Baseline} & \textbf{Qwen 3 0.6B} & \textbf{Qwen 3 1.7B} & \textbf{Qwen 3 4B} \\
|
||||
\midrule
|
||||
LoRA &\underline{57.50}&\underline{66.25}&81.27 \\
|
||||
AdaLoRA &56.50&64.37&78.89 \\
|
||||
FlyLoRA &54.37&62.12&\underline{81.33} \\
|
||||
\framework (ours)
|
||||
&\textbf{58.07}&\textbf{66.75}&\textbf{82.22} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\label{tab:scale}
|
||||
\vspace{-9px}
|
||||
\end{table}
|
||||
|
||||
\section{Experiments}
|
||||
To comprehensively evaluate the performance of our proposed CASCADE, we conduct extensive experiments guided by the following key research questions (RQs):
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textbf{RQ1:}
|
||||
How does CASCADE compare with representative PEFT baselines across commonsense and arithmetic tasks?
|
||||
|
||||
\item \textbf{RQ2:}
|
||||
How does CASCADE scale across different parameter sizes within the same LLM family?
|
||||
|
||||
\item \textbf{RQ3:}
|
||||
How do individual design components contribute to the performance of CASCADE?
|
||||
\item \textbf{RQ4:}
|
||||
How do contributions from different frequency experts vary across layers under the routing mechanism?
|
||||
\end{itemize}
|
||||
|
||||
|
||||
We first introduce the experimental setup and then systematically address each of the above research questions.
|
||||
|
||||
|
||||
\subsection{Experimental Setup}
|
||||
\paragraph{Datasets.}
|
||||
Following the setup of LLM-Adapters~\cite{hu2023llm}, we evaluate CASCADE on \textit{Commonsense} and \textit{Arithmetic QA} tasks, using the \textit{Commonsense15K} and \textit{Math10K} datasets constructed from multiple data sources.
|
||||
Commonsense performance is evaluated on eight benchmarks: BoolQ~\cite{clark2019boolq}, PIQA~\cite{bisk2020piqa}, SIQA~\cite{sap2019socialiqa}, ARC-Easy\&Challenge~\cite{clark2018think}, OBQA~\cite{mihaylov2018can}, HellaSwag~\cite{zellers2019hellaswag}, and WinoGrande~\cite{sakaguchi2020winogrande},
|
||||
while Arithmetic performance is assessed on seven benchmarks: MultiArith~\cite{roy2016solving}, GSM8K~\cite{cobbe2021training}, AddSub~\cite{hosseini2014learning}, AQuA~\cite{ling2017program}, SingleEq~\cite{koncel2015parsing}, SVAMP~\cite{patel2021nlp}, and MAWPS~\cite{koncel2016mawps}.
|
||||
Accuracy is reported as the evaluation metric, with additional details provided in the Appendix.
|
||||
|
||||
\paragraph{Backbone Models.}
|
||||
We evaluate our method on three representative pre-trained LLM backbones: \textbf{Qwen3}~\cite{qwen3technicalreport}, \textbf{Gemma 3}~\cite{gemma_2025}, and \textbf{LLaMA 3.2}~\cite{grattafiori2024llama}.
|
||||
These models span diverse architectures, enabling a comprehensive evaluation.
|
||||
\paragraph{Baseline Methods.}
|
||||
We compare our method with a diverse set of PEFT approaches spanning \textbf{low-rank adaptation} (LoRA~\cite{hu2021lora}, AdaLoRA~\cite{zhang2023adalora}, BONE~\cite{kang2024balancing}), \textbf{frequency-domain modeling} (FourierFT~\cite{gao2024parameter}, LoCA~\cite{du2025loca}), and \textbf{MoE-based} designs (FlyLoRA~\cite{zou2025flylora}). All methods are implemented following their original settings.
|
||||
|
||||
|
||||
\paragraph{Implementation Details.}
|
||||
All experiments are conducted on NVIDIA GeForce RTX 3090 GPUs, using bfloat16 precision with DeepSpeed for efficient training.
Key hyperparameters of CASCADE include 20K low-frequency DCT coefficients, 10K wavelet coefficients, a spatial residual expert with rank 48, and load-balancing and orthogonality loss weights both set to 0.01.
|
||||
For detailed implementation, please refer to the Appendix and our code for reproducibility\footnote{\codelink}. %
|
||||
|
||||
|
||||
\begin{table*}[t]
|
||||
\centering
|
||||
\small
|
||||
\caption{Comparison of CASCADE and representative PEFT baselines on arithmetic reasoning benchmarks with the Qwen3-4B model, reported in accuracy (\%).
|
||||
$\ ^{*}$ indicates statistically significant improvements over the best baseline (two-sided t-test, $p<0.05$).}
|
||||
\resizebox{0.94\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{0.95}
|
||||
\begin{tabular}{lcccccccc}
|
||||
\toprule
|
||||
\textbf{Baseline}
|
||||
& \textbf{MultiArith} & \textbf{GSM8K} & \textbf{AddSub}
|
||||
& \textbf{AQuA} & \textbf{SingleEq}
|
||||
& \textbf{SVAMP} & \textbf{MAWPS}
|
||||
& \textbf{micro-avg(\%)$\uparrow$} \\
|
||||
\midrule
|
||||
LoRA &\underline{77.50}&\underline{36.16}&\underline{83.80}&26.77&85.83&55.90&\underline{79.41}&\underline{58.53}\\
|
||||
AdaLoRA &\underline{80.50}&33.81&75.95&22.83&74.41&48.80&74.37&54.01\\
|
||||
BONE &79.50&31.69&78.99&\underline{27.17}&80.71&50.30&76.05&54.94\\
|
||||
FourierFT &68.67&31.08&76.46&23.62&78.54&\underline{57.30}&74.34&54.02\\
|
||||
LoCA &73.33&30.63&72.15&21.65&75.98&48.30&69.33&51.41\\
|
||||
FlyLoRA &79.67&35.33&81.52&22.83&\underline{86.42}&56.20&73.11&57.93\\
|
||||
\framework (ours)
|
||||
&\textbf{81.33}&\textbf{37.00}&\textbf{86.08}&\textbf{27.56}
|
||||
&\textbf{87.60}&\textbf{57.90}&\textbf{80.25}&\textbf{60.29}$^{*}$\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\label{tab:main_arith}
|
||||
\vspace{-4px}
|
||||
\end{table*}
|
||||
|
||||
\subsection{Overall Performance (RQ1)}
|
||||
To answer RQ1, we compare CASCADE with baselines on two categories of tasks: commonsense and arithmetic QA.
|
||||
|
||||
As shown in Table~\ref{tab:main_common}, CASCADE consistently achieves the best micro-averaged accuracy across all three backbone models.
|
||||
Compared with strong baselines such as LoRA, AdaLoRA, and recent frequency-domain methods, CASCADE yields consistent and statistically significant improvements, demonstrating robust performance across different architectures and commonsense benchmarks.
|
||||
These results indicate that jointly modeling heterogeneous update components is more effective than relying on a single structural assumption.
|
||||
By capturing both global low-frequency structure and localized high-frequency refinements, CASCADE better adapts to diverse commonsense reasoning patterns.
|
||||
|
||||
We further evaluate CASCADE on arithmetic reasoning benchmarks using the Qwen3-4B backbone, with results reported in Table~\ref{tab:main_arith}.
|
||||
Consistent with the observations on commonsense tasks, CASCADE achieves the highest overall performance, outperforming all baselines in terms of micro-averaged accuracy.
|
||||
Together, these results demonstrate that CASCADE provides a more effective PEFT strategy across both commonsense and arithmetic reasoning tasks.
|
||||
|
||||
\subsection{Scalability Analysis (RQ2)}
|
||||
Table~\ref{tab:scale} reports the performance of CASCADE across different parameter scales within the Qwen-3 family.
|
||||
CASCADE consistently outperforms the strongest PEFT baselines at all model sizes, from 0.6B to 4B parameters.
|
||||
Notably, the performance advantage remains stable as model scale increases, indicating that CASCADE scales favorably with model capacity.
|
||||
This trend validates the effectiveness of CASCADE's design, demonstrating that explicitly modeling heterogeneous update components and their coarse-to-fine coordination remains robust across different model scales within the same LLM family.
\subsection{Ablation and Analysis (RQ3, 4)}
|
||||
Fig.~\ref{fig:abla} (left) reports ablation results by removing key components of CASCADE.
|
||||
Removing either the DCT or Wavelet expert leads to clear performance degradation, indicating that both global and local update modeling are necessary.
|
||||
Disabling cascaded spectral modulation further reduces accuracy, highlighting the importance of explicitly modeling coarse-to-fine dependencies rather than combining experts independently.
|
||||
In addition, the spatial residual expert provides consistent gains by compensating for update patterns not well captured in the frequency domain.
|
||||
|
||||
Fig.~\ref{fig:abla} (right) visualizes the routing weights across layers for different experts.
|
||||
Lower layers allocate more weight to the low-frequency (DCT) expert, reflecting a preference for global structural adaptation.
|
||||
As depth increases, the routing gradually shifts toward high-frequency (Wavelet), indicating an increased emphasis on localized and fine-grained refinements.
|
||||
This layer-wise trend is consistent with the intended coarse-to-fine adaptation behavior of CASCADE.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\resizebox{1\linewidth}{!}{%
|
||||
\begin{minipage}{\linewidth}
|
||||
\centering
|
||||
\begin{subfigure}[b]{0.495\linewidth}
|
||||
\includegraphics[width=\linewidth]{assets/ablation_main.pdf}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{0.495\linewidth}
|
||||
\includegraphics[width=\linewidth]{assets/router_weights_by_layer.pdf}
|
||||
\end{subfigure}
|
||||
\end{minipage}
|
||||
}
|
||||
\caption{Ablation and routing behavior analysis of CASCADE.}
|
||||
\label{fig:abla}
|
||||
\vspace{-10px}
|
||||
\end{figure}
|
||||
|
||||
\section{Related Work}
|
||||
|
||||
\paragraph{Parameter-Efficient Fine-Tuning.}
|
||||
Parameter-efficient fine-tuning (PEFT) adapts large pretrained models by introducing a small number of task-specific parameters while keeping the backbone frozen~\cite{lialin2023scaling}.
|
||||
Representative approaches include adapter-based methods~\cite{pfeiffer2020adapterhub}, prefix tuning~\cite{li2021prefix}, and low-rank adaptation (LoRA)~\cite{hu2021lora}, which models weight updates under a low-rank assumption.
|
||||
Subsequent variants improve flexibility via adaptive rank allocation (e.g., AdaLoRA~\cite{zhang2023adalora}), balancing update magnitude and direction (BONE~\cite{kang2024balancing}), or exploring alternative structured parameterizations such as frequency-domain representations (FourierFT~\cite{gao2024parameter}).
|
||||
More recently, expert-based PEFT methods incorporate routing or mixtures of multiple adaptation modules to improve task decoupling and specialization (e.g., FlyLoRA~\cite{zou2025flylora}, MoELoRA~\cite{luo2024moelora}).
|
||||
Despite their effectiveness, most PEFT methods still rely on a single dominant structural hypothesis for weight updates, which limits their ability to capture heterogeneous adaptation patterns that involve both global and localized refinements.
|
||||
|
||||
\paragraph{Frequency-Domain and Structured Adaptation.}
|
||||
Beyond low-rank factorization, recent work explores parameterizing weight updates in transformed domains~\cite{zhang2025f}.
|
||||
Methods like FourierFT represent weight updates in the Fourier domain using global frequency components~\cite{gao2024parameter,shen2024parameter}.
|
||||
Wavelet-based approaches adopt multi-resolution representations to capture both global structure and localized variations, and LoCA further incorporates location-aware parameterization on cosine representations to model structured, position-sensitive updates~\cite{hu2025waveletft,du2025loca}.
|
||||
By associating low-frequency components with smooth global structure and high-frequency components with localized variations, these methods offer an alternative spectral perspective on adaptation.
|
||||
However, most frequency-domain approaches adopt a single transform or scale and treat frequencies independently, failing to model coarse-to-fine interactions and to coordinate global and local refinements.
|
||||
|
||||
\section{Conclusion}
|
||||
In this paper, we presented CASCADE, a PEFT framework that models LLM weight updates through heterogeneous experts across frequency and spatial domains.
|
||||
By explicitly decomposing weight updates into global low-frequency structures, localized high-frequency refinements, and residual spatial corrections, CASCADE provides a unified and expressive representation of diverse adaptation behaviors.
|
||||
A key contribution of CASCADE is the cascaded spectral modulation mechanism, which establishes an explicit coarse-to-fine dependency between global and local updates, thereby improving the coherence and consistency of the adaptation process.
|
||||
In addition, the spectral complexity-aware routing mechanism enables adaptive expert combination.
|
||||
Extensive experiments across multiple backbone models, tasks, and model scales demonstrate that CASCADE consistently outperforms existing PEFT methods.
|
||||
These results show that explicitly modeling heterogeneous update structures and their dependencies is effective and robust for LLM adaptation.
|
||||
|
||||
\appendix
|
||||
|
||||
\section{Experimental Details}
|
||||
\label{sec:appendix}
|
||||
|
||||
\subsection{Training Setup}
|
||||
All experiments are conducted on NVIDIA GeForce RTX 3090 GPUs.
|
||||
We employ DeepSpeed with ZeRO Stage~2 optimization for memory-efficient training.
|
||||
The training configuration is kept consistent across all methods to ensure fair comparison.
|
||||
Specifically, we use a per-device batch size of 2 with 2 gradient accumulation steps,
|
||||
resulting in an effective batch size of 4.
|
||||
The learning rate is set to $1 \times 10^{-4}$ with a cosine learning rate scheduler
|
||||
and a warmup ratio of 0.1.
|
||||
All models are trained with a maximum sequence length of 2048 tokens.
|
||||
All training is performed in \texttt{bfloat16} precision using the FusedAdam optimizer
|
||||
with momentum parameters $\beta_1 = 0.9$ and $\beta_2 = 0.95$.
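For reference, a hypothetical DeepSpeed-style configuration mirroring these settings is sketched below; the key names follow the public DeepSpeed configuration schema, but this is not the exact file used in our experiments, and the cosine schedule with warmup is omitted here.
\begin{verbatim}
# Hypothetical DeepSpeed-style configuration mirroring the reported settings.
ds_config = {
    "train_micro_batch_size_per_gpu": 2,
    "gradient_accumulation_steps": 2,
    "bf16": {"enabled": True},
    "zero_optimization": {"stage": 2},
    "optimizer": {"type": "Adam",
                  "params": {"lr": 1e-4, "betas": [0.9, 0.95]}},
}
\end{verbatim}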
|
||||
|
||||
|
||||
\subsection{Software and Environment}
|
||||
|
||||
The experiments were conducted using the following software packages and versions for reproducibility:
|
||||
|
||||
\begin{itemize}
|
||||
\item torch==2.1.2
|
||||
\item deepspeed==0.12.6
|
||||
\item numpy==1.26.4
|
||||
\item peft==0.16.0
|
||||
\item transformers==4.47.1
|
||||
\item tokenizers==0.21.2
|
||||
\item CUDA==12.1
|
||||
\end{itemize}
|
||||
|
||||
The hardware environment configuration is as follows:
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item OS: Ubuntu 20.04 LTS
|
||||
\item CPU: Intel Xeon Silver 4214R
|
||||
\item GPU: NVIDIA GeForce RTX 3090
|
||||
\item Memory: 512GB RAM
|
||||
\end{itemize}
|
||||
Detailed implementation and datasets can be found in our codebase\footnote{\codelink}.
|
||||
|
||||
|
||||
\subsection{CASCADE Configuration}
|
||||
CASCADE applies heterogeneous frequency-domain and spatial-domain experts
|
||||
to all linear projection layers in the Transformer architecture.
|
||||
The low-frequency expert uses 20,000 DCT coefficients selected by Manhattan
|
||||
distance from the DC component.
|
||||
The high-frequency expert adopts a single-level 2D Haar wavelet transform
|
||||
with a total of 10,000 learnable coefficients distributed across the three
|
||||
detail subbands (LH, HL, HH), while the approximation subband is fixed to zero.
|
||||
The spatial residual expert is parameterized as a low-rank adapter with rank $r=48$.
|
||||
A lightweight routing module produces input-dependent expert weights.
|
||||
Auxiliary regularization includes load-balancing and spectral orthogonality losses,
|
||||
both weighted by 0.01.
|
||||
|
||||
\subsection{Baselines}
|
||||
We compare CASCADE with representative parameter-efficient fine-tuning methods
|
||||
spanning low-rank adaptation, frequency-domain parameterization,
|
||||
and mixture-of-experts approaches.
|
||||
These include LoRA, AdaLoRA, BONE, FourierFT, LoCA, and FlyLoRA.
|
||||
All baseline methods apply adapters to linear layers following the same configuration
|
||||
as CASCADE.
|
||||
|
||||
\section{Evaluation Protocol and Metrics}
|
||||
|
||||
\subsection{Generation Procedure}
|
||||
All model outputs are generated using auto-regressive decoding via the \texttt{generate()} API in Hugging Face Transformers.
|
||||
We employ greedy decoding~(\texttt{do\_sample=False}), and set a maximum of 256 new tokens~(\texttt{max\_new\_tokens=256}).
|
||||
|
||||
Each input follows a unified instruction template, as shown below:
|
||||
\begin{tcolorbox}[boxrule=0.8pt]
|
||||
\textless s\textgreater Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
\#\#\# Instruction:\\
|
||||
\{instruction\}
|
||||
\\
|
||||
\\
|
||||
\#\#\# Response:
|
||||
\end{tcolorbox}
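The decoding setup corresponds to the following illustrative snippet, where the model identifier and the instruction text are placeholders rather than the exact values used in our pipeline.
\begin{verbatim}
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")   # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B",
                                             torch_dtype=torch.bfloat16)

instruction = "..."  # task-specific instruction text (placeholder)
prompt = ("Below is an instruction that describes a task. Write a response "
          "that appropriately completes the request.\n\n"
          "### Instruction:\n" + instruction + "\n\n### Response:")
inputs = tok(prompt, return_tensors="pt")
out = model.generate(**inputs, do_sample=False, max_new_tokens=256)
print(tok.decode(out[0][inputs["input_ids"].shape[1]:],
                 skip_special_tokens=True))
\end{verbatim}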
|
||||
|
||||
\subsection{Answer Extraction and Accuracy Calculation}
|
||||
Results are calculated based on extracted predictions from generated outputs using task-specific regular expressions:
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textit{Commonsense QA:} Extracted exact-match answers (true/false, or the solution/answer/ending option labels) and computed accuracy by direct matching against ground-truth labels.
|
||||
\item \textit{Arithmetic QA:} Extracted numerical answers from output text (with absolute tolerance of $10^{-3}$) or alphabetic choices (A-E) for the AQuA dataset.
|
||||
\end{itemize}
|
||||
|
||||
All extraction and accuracy computation scripts are provided for reproducibility in our codebase.
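The extraction logic described above amounts to patterns of the following form; these regular expressions are illustrative and not necessarily identical to those in the released scripts.
\begin{verbatim}
import re

def extract_numeric(text, gold, tol=1e-3):
    # Last number in the generated text, compared with absolute tolerance.
    nums = re.findall(r"-?\d+(?:\.\d+)?", text.replace(",", ""))
    return bool(nums) and abs(float(nums[-1]) - gold) < tol

def extract_choice(text):
    # Last standalone A-E letter for AQuA-style multiple choice.
    m = re.findall(r"\b([A-E])\b", text)
    return m[-1] if m else None
\end{verbatim}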
|
||||
|
||||
|
||||
\section{Dataset Details}
|
||||
|
||||
\subsection{Training Datasets}
|
||||
We utilize two unified instruction-tuning datasets provided by LLM-Adapters~\cite{hu2023llm}:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item \textbf{Commonsense15K} covers a wide range of commonsense reasoning questions. All examples are template-normalized into a consistent instruction format, supporting robust cross-task generalization.
|
||||
\item \textbf{Math10K} comprises diverse math word problems, each annotated with a step-by-step chain-of-thought solution and a final answer, enabling thorough evaluation of arithmetic reasoning under instruction-following settings.
|
||||
\end{itemize}
|
||||
The summary of dataset statistics is provided in Table~\ref{tab:dataset}.
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{0.95\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.01}
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\textbf{Dataset} & \textbf{Samples} & \textbf{Total Tokens} & \textbf{Avg. Tokens/Sample} \\
|
||||
\midrule
|
||||
Commonsense15K & 15,119 & 1,778,782 & 117.65 \\
|
||||
Math10K & 9,919 & 2,273,016 & 229.16 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Statistics of the training datasets for commonsense and arithmetic QA tasks.
|
||||
}
|
||||
\label{tab:dataset}
|
||||
\end{table}
\noindent \textbf{a) Commonsense QA:}
|
||||
\begin{itemize}[leftmargin=1em]
|
||||
\item \textbf{BoolQ}~\cite{clark2019boolq}: BoolQ is a yes/no question answering dataset featuring naturally occurring, information-seeking queries and passage-based inference.
|
||||
\item \textbf{PIQA}~\cite{bisk2020piqa}: PIQA is a benchmark for physical commonsense reasoning, focused on practical everyday tasks with two candidate solutions.
|
||||
\item \textbf{SIQA}~\cite{sap2019socialiqa}: Social IQa is a multiple-choice benchmark that tests social and emotional commonsense reasoning in daily situations.
|
||||
\item \textbf{ARC-Challenge / ARC-Easy}~\cite{clark2018think}: The AI2 Reasoning Challenge (ARC) is a science question answering benchmark consisting of grade-school level, multiple-choice questions divided into Easy and Challenge subsets by difficulty.
|
||||
\item \textbf{OBQA}~\cite{mihaylov2018can}: OpenBookQA is a science question answering benchmark requiring multi-step reasoning over a provided set of core science facts.
|
||||
\item \textbf{HellaSwag}~\cite{zellers2019hellaswag}: HellaSwag is a natural language inference benchmark with adversarially-filtered continuations requiring robust commonsense reasoning.
|
||||
\item \textbf{WinoGrande}~\cite{sakaguchi2020winogrande}: WinoGrande is a binary fill-in-the-blank pronoun resolution benchmark designed to require advanced commonsense reasoning.
|
||||
\end{itemize}
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{1\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.01}
|
||||
\begin{tabular}{lcc}
|
||||
\toprule
|
||||
\textbf{Dataset} & \textbf{Samples} & \textbf{Answer Format} \\
|
||||
\midrule
|
||||
BoolQ & 3,270 & true / false \\
|
||||
PIQA & 1,838 & solution1 / solution2 \\
|
||||
SIQA & 1,954 & answer1 / answer2 / answer3 \\
|
||||
ARC-Challenge & 1,172 & answer1 / answer2 / answer3 / answer4 \\
|
||||
ARC-Easy & 2,376 & answer1 / answer2 / answer3 / answer4 \\
|
||||
OBQA & 500 & answer1 / answer2 / answer3 / answer4 \\
|
||||
HellaSwag & 10,042 & ending1 / ending2 / ending3 / ending4 \\
|
||||
WinoGrande & 1,267 & option1 / option2 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Statistics of Commonsense QA Test Datasets.}
|
||||
\label{tab:commonsense-datasets}
|
||||
\end{table}
|
||||
|
||||
\noindent \textbf{b) Arithmetic QA:}
|
||||
\begin{itemize}[leftmargin=1em]
|
||||
\item \textbf{MultiArith}~\cite{roy2016solving}: MultiArith contains multi-step arithmetic word problems to evaluate a system's ability to handle complex reasoning chains.
|
||||
\item \textbf{GSM8K}~\cite{cobbe2021training}: GSM8K is a dataset of linguistically diverse grade-school math word problems, designed for benchmarking multi-step arithmetic reasoning with natural language solutions.
|
||||
\item \textbf{AddSub}~\cite{hosseini2014learning}: AddSub is a corpus of short word problems focused exclusively on addition and subtraction, used to assess basic arithmetic reasoning capabilities.
|
||||
\item \textbf{AQuA}~\cite{ling2017program}: AQuA is a large-scale dataset of algebraic word problems, each paired with natural language rationales to support step-by-step reasoning.
|
||||
\item \textbf{SingleEq}~\cite{koncel2015parsing}: SingleEq is a collection of multi-sentence algebraic word problems, emphasizing equation tree parsing and formal reasoning.
|
||||
\item \textbf{SVAMP}~\cite{patel2021nlp}: SVAMP is a challenge set constructed from elementary math word problems, aimed at evaluating a model's robustness to question sensitivity, structural variations, and reasoning challenges.
|
||||
\item \textbf{MAWPS}~\cite{koncel2016mawps}: MAWPS is a repository of math word problems drawn from multiple sources, offering a unified benchmark for evaluating models.
|
||||
\end{itemize}
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\resizebox{0.8\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{0.95}
|
||||
\begin{tabular}{lcc}
|
||||
\toprule
|
||||
\textbf{Dataset} & \textbf{Samples} & \textbf{Answer Type} \\
|
||||
\midrule
|
||||
MultiArith & 600 & Numeric \\
|
||||
GSM8K & 1,319 & Numeric \\
|
||||
AddSub & 395 & Numeric \\
|
||||
AQuA & 254 & Multiple Choice (A--E) \\
|
||||
SingleEq & 508 & Numeric \\
|
||||
SVAMP & 1,000 & Numeric \\
|
||||
MAWPS & 238 & Numeric \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\caption{Statistics of Arithmetic QA Test Datasets.}
|
||||
\label{tab:arith-datasets}
|
||||
\end{table}
|
||||
|
||||
|
||||
|
||||
\subsection{Evaluation Benchmarks}
|
||||
We evaluate model performance on a suite of well-established commonsense and arithmetic QA benchmarks, enabling a comprehensive assessment of both generalization and robustness.
|
||||
Detailed statistics for all evaluation datasets can be found in Table~\ref{tab:commonsense-datasets}~(Commonsense) and Table \ref{tab:arith-datasets}~(Arithmetic).
|
||||
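As a minimal sketch of how the tabulated answer formats could be scored, the following exact-match extraction is an illustrative assumption rather than the evaluation script used in our experiments; the helper names are hypothetical.
\begin{verbatim}
# Illustrative sketch (assumption): exact-match scoring of generated
# answers against the gold labels listed in the tables above.
def extract_choice(generation: str, choices: list[str]) -> str:
    # Return the first listed choice mentioned in the generation,
    # e.g. "true"/"false" for BoolQ or "answer1".."answer4" for OBQA.
    text = generation.lower()
    for c in choices:
        if c.lower() in text:
            return c
    return ""  # no parsable answer counts as incorrect

def accuracy(generations, golds, choices):
    hits = sum(extract_choice(g, choices) == y
               for g, y in zip(generations, golds))
    return hits / len(golds)

# Example: acc = accuracy(outputs, labels, ["true", "false"])  # BoolQ
\end{verbatim}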
|
||||
830
mypaper/IJCAI2026_MESSA.bib
Executable file
@@ -0,0 +1,830 @@
|
||||
% Spar-Dataset
|
||||
@inproceedings{clark2019boolq,
|
||||
title={{B}ool{Q}: Exploring the Surprising Difficulty of Natural Yes/No Questions},
|
||||
author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
|
||||
booktitle={Proceedings of NAACL-HLT 2019},
|
||||
year={2019}
|
||||
}
|
||||
@misc{codealpaca,
|
||||
author = {Sahil Chaudhary},
|
||||
title = {Code Alpaca: An Instruction-following LLaMA model for code generation},
|
||||
year = {2023},
|
||||
publisher = {GitHub},
|
||||
journal = {GitHub repository},
|
||||
howpublished = {\url{https://github.com/sahil280114/codealpaca}},
|
||||
}
|
||||
@article{jin2020disease,
|
||||
title={What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset from Medical Exams},
|
||||
author={Jin, Di and Pan, Eileen and Oufattole, Nassim and Weng, Wei-Hung and Fang, Hanyi and Szolovits, Peter},
|
||||
journal={arXiv preprint arXiv:2009.13081},
|
||||
year={2020}
|
||||
}
|
||||
@article{cobbe2021gsm8k,
|
||||
title={Training Verifiers to Solve Math Word Problems},
|
||||
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
|
||||
journal={arXiv preprint arXiv:2110.14168},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
@inproceedings{zellers2019hellaswag,
|
||||
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
||||
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
|
||||
booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
|
||||
year={2019}
|
||||
}
|
||||
% LLM
|
||||
@misc{qwen3technicalreport,
|
||||
title={Qwen3 Technical Report},
|
||||
author={Qwen Team},
|
||||
year={2025},
|
||||
eprint={2505.09388},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CL},
|
||||
url={https://arxiv.org/abs/2505.09388},
|
||||
}
|
||||
@article{grattafiori2024llama,
|
||||
title={The llama 3 herd of models},
|
||||
author={Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and others},
|
||||
journal={arXiv preprint arXiv:2407.21783},
|
||||
year={2024}
|
||||
}
|
||||
@article{gemma_2025,
|
||||
title={Gemma 3},
|
||||
url={https://goo.gle/Gemma3Report},
|
||||
publisher={Kaggle},
|
||||
author={Gemma Team},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
% Baseline
|
||||
@article{hu2021lora,
|
||||
title={Lora: Low-rank adaptation of large language models},
|
||||
author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
|
||||
journal={arXiv preprint arXiv:2106.09685},
|
||||
year={2021}
|
||||
}
|
||||
@article{zhang2023adalora,
|
||||
title={Adalora: Adaptive budget allocation for parameter-efficient fine-tuning},
|
||||
author={Zhang, Qingru and Chen, Minshuo and Bukharin, Alexander and Karampatziakis, Nikos and He, Pengcheng and Cheng, Yu and Chen, Weizhu and Zhao, Tuo},
|
||||
journal={arXiv preprint arXiv:2303.10512},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{mao2022unipelt,
|
||||
title={Unipelt: A unified framework for parameter-efficient language model tuning},
|
||||
author={Mao, Yuning and Mathias, Lambert and Hou, Rui and Almahairi, Amjad and Ma, Hao and Han, Jiawei and Yih, Scott and Khabsa, Madian},
|
||||
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
|
||||
pages={6253--6264},
|
||||
year={2022}
|
||||
}
|
||||
|
||||
@article{sheng2023s,
|
||||
title={S-lora: Serving thousands of concurrent lora adapters},
|
||||
author={Sheng, Ying and Cao, Shiyi and Li, Dacheng and Hooper, Coleman and Lee, Nicholas and Yang, Shuo and Chou, Christopher and Zhu, Banghua and Zheng, Lianmin and Keutzer, Kurt and others},
|
||||
journal={arXiv preprint arXiv:2311.03285},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@inproceedings{shiracite,
|
||||
author = {Bhardwaj, Kartikeya and Pandey, Nilesh Prasad and Priyadarshi, Sweta and Ganapathy, Viswanath and Kadambi, Shreya and Esteves, Rafael and Borse, Shubhankar and Whatmough, Paul and Garrepalli, Risheek and Van Baalen, Mart and Teague, Harris and Nagel, Markus},
|
||||
title = {Sparse high rank adapters},
|
||||
year = {2024},
|
||||
isbn = {9798331314385},
|
||||
publisher = {Curran Associates Inc.},
|
||||
address = {Red Hook, NY, USA},
|
||||
booktitle = {Proceedings of the 38th International Conference on Neural Information Processing Systems},
|
||||
articleno = {438},
|
||||
numpages = {31},
|
||||
location = {Vancouver, BC, Canada},
|
||||
series = {NIPS '24}
|
||||
}
|
||||
@inproceedings{agiza2024mtlora,
|
||||
title={MTLoRA: Low-Rank Adaptation Approach for Efficient Multi-Task Learning},
|
||||
author={Agiza, Ahmed and Neseem, Marina and Reda, Sherief},
|
||||
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
|
||||
pages={16196--16205},
|
||||
year={2024}
|
||||
}
|
||||
@inproceedings{liu2024moe,
|
||||
title={When MOE Meets LLMs: Parameter Efficient Fine-tuning for Multi-task Medical Applications},
|
||||
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
|
||||
booktitle={Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
|
||||
pages={1104--1114},
|
||||
year={2024}
|
||||
}
|
||||
% IJCAI
|
||||
@article{han2024parameter,
|
||||
title={Parameter-efficient fine-tuning for large models: A comprehensive survey},
|
||||
author={Han, Zeyu and Gao, Chao and Liu, Jinyang and Zhang, Jeff and Zhang, Sai Qian},
|
||||
journal={arXiv preprint arXiv:2403.14608},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
% adapter
|
||||
@inproceedings{houlsby2019parameter,
|
||||
title={Parameter-efficient transfer learning for NLP},
|
||||
author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={2790--2799},
|
||||
year={2019},
|
||||
organization={PMLR}
|
||||
}
|
||||
|
||||
% AAAING
|
||||
|
||||
% Datasets
|
||||
% GSM8K
|
||||
@article{cobbe2021training,
|
||||
title={Training verifiers to solve math word problems},
|
||||
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
|
||||
journal={arXiv preprint arXiv:2110.14168},
|
||||
year={2021}
|
||||
}
|
||||
% SVAMP
|
||||
@article{patel2021nlp,
|
||||
title={Are NLP models really able to solve simple math word problems?},
|
||||
author={Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
|
||||
journal={arXiv preprint arXiv:2103.07191},
|
||||
year={2021}
|
||||
}
|
||||
% MultiArith
|
||||
@article{roy2016solving,
|
||||
title={Solving general arithmetic word problems},
|
||||
author={Roy, Subhro and Roth, Dan},
|
||||
journal={arXiv preprint arXiv:1608.01413},
|
||||
year={2016}
|
||||
}
|
||||
% Addsub
|
||||
@inproceedings{hosseini2014learning,
|
||||
title={Learning to solve arithmetic word problems with verb categorization},
|
||||
author={Hosseini, Mohammad Javad and Hajishirzi, Hannaneh and Etzioni, Oren and Kushman, Nate},
|
||||
booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
|
||||
pages={523--533},
|
||||
year={2014}
|
||||
}
|
||||
% AQuA
|
||||
@article{ling2017program,
|
||||
title={Program induction by rationale generation: Learning to solve and explain algebraic word problems},
|
||||
author={Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
|
||||
journal={arXiv preprint arXiv:1705.04146},
|
||||
year={2017}
|
||||
}
|
||||
% SingleEq
|
||||
@article{koncel2015parsing,
|
||||
title={Parsing algebraic word problems into equations},
|
||||
author={Koncel-Kedziorski, Rik and Hajishirzi, Hannaneh and Sabharwal, Ashish and Etzioni, Oren and Ang, Siena Dumas},
|
||||
journal={Transactions of the Association for Computational Linguistics},
|
||||
volume={3},
|
||||
pages={585--597},
|
||||
year={2015},
|
||||
publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…}
|
||||
}
|
||||
% MAWPS
|
||||
@inproceedings{koncel2016mawps,
|
||||
title={MAWPS: A math word problem repository},
|
||||
author={Koncel-Kedziorski, Rik and Roy, Subhro and Amini, Aida and Kushman, Nate and Hajishirzi, Hannaneh},
|
||||
booktitle={Proceedings of the 2016 conference of the north american chapter of the association for computational linguistics: human language technologies},
|
||||
pages={1152--1157},
|
||||
year={2016}
|
||||
}
|
||||
% PIQA
|
||||
@inproceedings{bisk2020piqa,
|
||||
title={Piqa: Reasoning about physical commonsense in natural language},
|
||||
author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
|
||||
booktitle={Proceedings of the AAAI conference on artificial intelligence},
|
||||
volume={34},
|
||||
number={05},
|
||||
pages={7432--7439},
|
||||
year={2020}
|
||||
}
|
||||
% SIQA
|
||||
@article{sap2019socialiqa,
|
||||
title={Socialiqa: Commonsense reasoning about social interactions},
|
||||
author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and LeBras, Ronan and Choi, Yejin},
|
||||
journal={arXiv preprint arXiv:1904.09728},
|
||||
year={2019}
|
||||
}
|
||||
|
||||
% WN
|
||||
@inproceedings{sakaguchi2020winogrande,
|
||||
title={Winogrande: An adversarial winograd schema challenge at scale},
|
||||
author={Sakaguchi, Keisuke and Le Bras, Ronan and Bhagavatula, Chandra and Choi, Yejin},
|
||||
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
volume={34},
|
||||
number={05},
|
||||
pages={8732--8740},
|
||||
year={2020}
|
||||
}
|
||||
% ARC
|
||||
@article{clark2018think,
|
||||
title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
|
||||
author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
|
||||
journal={arXiv preprint arXiv:1803.05457},
|
||||
year={2018}
|
||||
}
|
||||
% OBQA
|
||||
@article{mihaylov2018can,
|
||||
title={Can a suit of armor conduct electricity? a new dataset for open book question answering},
|
||||
author={Mihaylov, Todor and Clark, Peter and Khot, Tushar and Sabharwal, Ashish},
|
||||
journal={arXiv preprint arXiv:1809.02789},
|
||||
year={2018}
|
||||
}
|
||||
|
||||
% Related
|
||||
@article{ansell2024scaling,
|
||||
title={Scaling sparse fine-tuning to large language models},
|
||||
author={Ansell, Alan and Vuli{\'c}, Ivan and Sterz, Hannah and Korhonen, Anna and Ponti, Edoardo M},
|
||||
journal={arXiv preprint arXiv:2401.16405},
|
||||
year={2024}
|
||||
}
|
||||
@article{sanh2020movement,
|
||||
title={Movement pruning: Adaptive sparsity by fine-tuning},
|
||||
author={Sanh, Victor and Wolf, Thomas and Rush, Alexander},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={33},
|
||||
pages={20378--20389},
|
||||
year={2020}
|
||||
}
|
||||
|
||||
@article{li2021prefix,
|
||||
title={Prefix-tuning: Optimizing continuous prompts for generation},
|
||||
author={Li, Xiang Lisa and Liang, Percy},
|
||||
journal={arXiv preprint arXiv:2101.00190},
|
||||
year={2021}
|
||||
}
|
||||
@article{dong2025attention,
|
||||
title={Attention Retrieves, MLP Memorizes: Disentangling Trainable Components in the Transformer},
|
||||
author={Dong, Yihe and Noci, Lorenzo and Khodak, Mikhail and Li, Mufan},
|
||||
journal={arXiv preprint arXiv:2506.01115},
|
||||
year={2025}
|
||||
}
|
||||
@article{michel2019sixteen,
|
||||
title={Are sixteen heads really better than one?},
|
||||
author={Michel, Paul and Levy, Omer and Neubig, Graham},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={32},
|
||||
year={2019}
|
||||
}
|
||||
@article{belinkov2018evaluating,
|
||||
title={Evaluating layers of representation in neural machine translation on part-of-speech and semantic tagging tasks},
|
||||
author={Belinkov, Yonatan and M{\`a}rquez, Llu{\'\i}s and Sajjad, Hassan and Durrani, Nadir and Dalvi, Fahim and Glass, James},
|
||||
journal={arXiv preprint arXiv:1801.07772},
|
||||
year={2018}
|
||||
}
|
||||
% Others
|
||||
@article{ding2023parameter,
|
||||
title={Parameter-efficient fine-tuning of large-scale pre-trained language models},
|
||||
author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
|
||||
journal={Nature machine intelligence},
|
||||
volume={5},
|
||||
number={3},
|
||||
pages={220--235},
|
||||
year={2023},
|
||||
publisher={Nature Publishing Group UK London}
|
||||
}
|
||||
@article{peng2023instruction,
|
||||
title={Instruction tuning with gpt-4},
|
||||
author={Peng, Baolin and Li, Chunyuan and He, Pengcheng and Galley, Michel and Gao, Jianfeng},
|
||||
journal={arXiv preprint arXiv:2304.03277},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
% Baselines
|
||||
@article{liu2024dora,
|
||||
title={Dora: Weight-decomposed low-rank adaptation},
|
||||
author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
|
||||
journal={arXiv preprint arXiv:2402.09353},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
% C3A
|
||||
@article{chen2024parameter,
|
||||
title={Parameter-efficient fine-tuning via circular convolution},
|
||||
author={Chen, Aochuan and Cheng, Jiashun and Liu, Zijing and Gao, Ziqi and Tsung, Fugee and Li, Yu and Li, Jia},
|
||||
journal={arXiv preprint arXiv:2407.19342},
|
||||
year={2024}
|
||||
}
|
||||
% BONE
|
||||
@article{kang2024balancing,
|
||||
title={Balancing LoRA Performance and Efficiency with Simple Shard Sharing},
|
||||
author={Kang, Jiale and Yin, Qingyu},
|
||||
journal={arXiv preprint arXiv:2409.15371},
|
||||
year={2024}
|
||||
}
|
||||
% VeRA
|
||||
@article{kopiczko2023vera,
|
||||
title={Vera: Vector-based random matrix adaptation},
|
||||
author={Kopiczko, Dawid Jan and Blankevoort, Tijmen and Asano, Yuki M},
|
||||
journal={arXiv preprint arXiv:2310.11454},
|
||||
year={2023}
|
||||
}
|
||||
% BOFT
|
||||
@article{liu2023parameter,
|
||||
title={Parameter-efficient orthogonal finetuning via butterfly factorization},
|
||||
author={Liu, Weiyang and Qiu, Zeju and Feng, Yao and Xiu, Yuliang and Xue, Yuxuan and Yu, Longhui and Feng, Haiwen and Liu, Zhen and Heo, Juyeon and Peng, Songyou and others},
|
||||
journal={arXiv preprint arXiv:2311.06243},
|
||||
year={2023}
|
||||
}
|
||||
% LN-Tuning
|
||||
@article{zhao2023tuning,
|
||||
title={Tuning layernorm in attention: Towards efficient multi-modal llm finetuning},
|
||||
author={Zhao, Bingchen and Tu, Haoqin and Wei, Chen and Mei, Jieru and Xie, Cihang},
|
||||
journal={arXiv preprint arXiv:2312.11420},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
% Deepspeed
|
||||
@inproceedings{rasley2020deepspeed,
|
||||
title={Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters},
|
||||
author={Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
|
||||
booktitle={Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery \& data mining},
|
||||
pages={3505--3506},
|
||||
year={2020}
|
||||
}
|
||||
% Huggingface Transformers
|
||||
@inproceedings{wolf2020transformers,
|
||||
title={Transformers: State-of-the-art natural language processing},
|
||||
author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and others},
|
||||
booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations},
|
||||
pages={38--45},
|
||||
year={2020}
|
||||
}
|
||||
|
||||
@inproceedings{geva2021transformer,
|
||||
title={Transformer Feed-Forward Layers Are Key-Value Memories},
|
||||
author={Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer},
|
||||
booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
|
||||
pages={5484--5495},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
@article{su2024roformer,
|
||||
title={Roformer: Enhanced transformer with rotary position embedding},
|
||||
author={Su, Jianlin and Ahmed, Murtadha and Lu, Yu and Pan, Shengfeng and Bo, Wen and Liu, Yunfeng},
|
||||
journal={Neurocomputing},
|
||||
volume={568},
|
||||
pages={127063},
|
||||
year={2024},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
@article{barbero2024round,
|
||||
title={Round and round we go! what makes rotary positional encodings useful?},
|
||||
author={Barbero, Federico and Vitvitskyi, Alex and Perivolaropoulos, Christos and Pascanu, Razvan and Veli{\v{c}}kovi{\'c}, Petar},
|
||||
journal={arXiv preprint arXiv:2410.06205},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
@article{jin2025massive,
|
||||
title={Massive Values in Self-Attention Modules are the Key to Contextual Knowledge Understanding},
|
||||
author={Jin, Mingyu and Mei, Kai and Xu, Wujiang and Sun, Mingjie and Tang, Ruixiang and Du, Mengnan and Liu, Zirui and Zhang, Yongfeng},
|
||||
journal={arXiv preprint arXiv:2502.01563},
|
||||
year={2025}
|
||||
}
|
||||
@article{vaswani2017attention,
|
||||
title={Attention is all you need},
|
||||
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={30},
|
||||
year={2017}
|
||||
}
|
||||
@article{touvron2023llama,
|
||||
title={Llama: Open and efficient foundation language models},
|
||||
author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
|
||||
journal={arXiv preprint arXiv:2302.13971},
|
||||
year={2023}
|
||||
}
|
||||
@article{shazeer2020glu,
|
||||
title={Glu variants improve transformer},
|
||||
author={Shazeer, Noam},
|
||||
journal={arXiv preprint arXiv:2002.05202},
|
||||
year={2020}
|
||||
}
|
||||
@inproceedings{he2016deep,
|
||||
title={Deep residual learning for image recognition},
|
||||
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
|
||||
pages={770--778},
|
||||
year={2016}
|
||||
}
|
||||
@article{bai2023qwen,
|
||||
title={Qwen technical report},
|
||||
author={Bai, Jinze and Bai, Shuai and Chu, Yunfei and Cui, Zeyu and Dang, Kai and Deng, Xiaodong and Fan, Yang and Ge, Wenbin and Han, Yu and Huang, Fei and others},
|
||||
journal={arXiv preprint arXiv:2309.16609},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
% SiLU
|
||||
@article{elfwing2018sigmoid,
|
||||
title={Sigmoid-weighted linear units for neural network function approximation in reinforcement learning},
|
||||
author={Elfwing, Stefan and Uchibe, Eiji and Doya, Kenji},
|
||||
journal={Neural networks},
|
||||
volume={107},
|
||||
pages={3--11},
|
||||
year={2018},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
@article{ainslie2023gqa,
|
||||
title={Gqa: Training generalized multi-query transformer models from multi-head checkpoints},
|
||||
author={Ainslie, Joshua and Lee-Thorp, James and De Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
|
||||
journal={arXiv preprint arXiv:2305.13245},
|
||||
year={2023}
|
||||
}
|
||||
@article{voita2019bottom,
|
||||
title={The bottom-up evolution of representations in the transformer: A study with machine translation and language modeling objectives},
|
||||
author={Voita, Elena and Sennrich, Rico and Titov, Ivan},
|
||||
journal={arXiv preprint arXiv:1909.01380},
|
||||
year={2019}
|
||||
}
|
||||
@article{hu2023llm,
|
||||
title={Llm-adapters: An adapter family for parameter-efficient fine-tuning of large language models},
|
||||
author={Hu, Zhiqiang and Wang, Lei and Lan, Yihuai and Xu, Wanyu and Lim, Ee-Peng and Bing, Lidong and Xu, Xing and Poria, Soujanya and Lee, Roy Ka-Wei},
|
||||
journal={arXiv preprint arXiv:2304.01933},
|
||||
year={2023}
|
||||
}
|
||||
@article{team2024gemma,
|
||||
title={Gemma 2: Improving open language models at a practical size},
|
||||
author={Team, Gemma and Riviere, Morgane and Pathak, Shreya and Sessa, Pier Giuseppe and Hardin, Cassidy and Bhupatiraju, Surya and Hussenot, L{\'e}onard and Mesnard, Thomas and Shahriari, Bobak and Ram{\'e}, Alexandre and others},
|
||||
journal={arXiv preprint arXiv:2408.00118},
|
||||
year={2024}
|
||||
}
|
||||
@article{dubey2024llama,
|
||||
title={The llama 3 herd of models},
|
||||
author={Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
|
||||
journal={arXiv e-prints},
|
||||
pages={arXiv--2407},
|
||||
year={2024}
|
||||
}
|
||||
@article{team2024qwen2,
|
||||
title={Qwen2 technical report},
|
||||
author={Team, Qwen},
|
||||
journal={arXiv preprint arXiv:2407.10671},
|
||||
year={2024}
|
||||
}
|
||||
% Old
|
||||
|
||||
@article{sun2025stronger,
|
||||
title={A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
|
||||
author={Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2502.15828},
|
||||
year={2025}
|
||||
}
|
||||
@article{pfeiffer2020mad,
|
||||
title={Mad-x: An adapter-based framework for multi-task cross-lingual transfer},
|
||||
author={Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
|
||||
journal={arXiv preprint arXiv:2005.00052},
|
||||
year={2020}
|
||||
}
|
||||
@article{raffel2020exploring,
|
||||
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
|
||||
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
|
||||
journal={Journal of machine learning research},
|
||||
volume={21},
|
||||
number={140},
|
||||
pages={1--67},
|
||||
year={2020}
|
||||
}
|
||||
@article{zaken2021bitfit,
|
||||
title={Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
|
||||
author={Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
|
||||
journal={arXiv preprint arXiv:2106.10199},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{papineni2002bleu,
|
||||
title={Bleu: a method for automatic evaluation of machine translation},
|
||||
author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
|
||||
booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
|
||||
pages={311--318},
|
||||
year={2002}
|
||||
}
|
||||
@inproceedings{lin2004rouge,
|
||||
title={Rouge: A package for automatic evaluation of summaries},
|
||||
author={Lin, Chin-Yew},
|
||||
booktitle={Text summarization branches out},
|
||||
pages={74--81},
|
||||
year={2004}
|
||||
}
|
||||
@article{jang2016categorical,
|
||||
title={Categorical reparameterization with gumbel-softmax},
|
||||
author={Jang, Eric and Gu, Shixiang and Poole, Ben},
|
||||
journal={arXiv preprint arXiv:1611.01144},
|
||||
year={2016}
|
||||
}
|
||||
@inproceedings{he2015delving,
|
||||
title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},
|
||||
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||
booktitle={Proceedings of the IEEE international conference on computer vision},
|
||||
pages={1026--1034},
|
||||
year={2015}
|
||||
}
|
||||
@article{guo2025nlora,
|
||||
title={NLoRA: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
|
||||
author={Guo, Chenlu and Wu, Yuan and Chang, Yi},
|
||||
journal={arXiv preprint arXiv:2502.14482},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
@article{ba2016layer,
|
||||
title={Layer normalization},
|
||||
author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
|
||||
journal={arXiv preprint arXiv:1607.06450},
|
||||
year={2016}
|
||||
}
|
||||
|
||||
@article{team2023gemini,
|
||||
title={Gemini: a family of highly capable multimodal models},
|
||||
author={Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
|
||||
journal={arXiv preprint arXiv:2312.11805},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2023moelora,
|
||||
title={Moelora: An moe-based parameter efficient fine-tuning method for multi-task medical applications},
|
||||
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
|
||||
journal={arXiv preprint arXiv:2310.18339},
|
||||
year={2023}
|
||||
}
|
||||
@article{wang2023multilora,
|
||||
title={Multilora: Democratizing lora for better multi-task learning},
|
||||
author={Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
|
||||
journal={arXiv preprint arXiv:2311.11501},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2021p,
|
||||
title={P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
|
||||
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2110.07602},
|
||||
year={2021}
|
||||
}
|
||||
@article{brown2020language,
|
||||
title={Language models are few-shot learners},
|
||||
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={33},
|
||||
pages={1877--1901},
|
||||
year={2020}
|
||||
}
|
||||
@article{liu2021conflict,
|
||||
title={Conflict-averse gradient descent for multi-task learning},
|
||||
author={Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={18878--18890},
|
||||
year={2021}
|
||||
}
|
||||
@article{navon2022multi,
|
||||
title={Multi-task learning as a bargaining game},
|
||||
author={Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
|
||||
journal={arXiv preprint arXiv:2202.01017},
|
||||
year={2022}
|
||||
}
|
||||
@article{yu2020gradient,
|
||||
title={Gradient surgery for multi-task learning},
|
||||
author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={33},
|
||||
pages={5824--5836},
|
||||
year={2020}
|
||||
}
|
||||
@article{renduchintala2023tied,
|
||||
title={Tied-lora: Enhacing parameter efficiency of lora with weight tying},
|
||||
author={Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
|
||||
journal={arXiv preprint arXiv:2311.09578},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{kwon2023efficient,
|
||||
title={Efficient memory management for large language model serving with pagedattention},
|
||||
author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
|
||||
booktitle={Proceedings of the 29th Symposium on Operating Systems Principles},
|
||||
pages={611--626},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{dai2024deepseekmoe,
|
||||
title={Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models},
|
||||
author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
|
||||
journal={arXiv preprint arXiv:2401.06066},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2025deepseek,
|
||||
title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
|
||||
author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
|
||||
journal={arXiv preprint arXiv:2501.12948},
|
||||
year={2025}
|
||||
}
|
||||
@article{shazeer2017outrageously,
|
||||
title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
|
||||
author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
|
||||
journal={arXiv preprint arXiv:1701.06538},
|
||||
year={2017}
|
||||
}
|
||||
@inproceedings{rajbhandari2022deepspeed,
|
||||
title={Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale},
|
||||
author={Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={18332--18346},
|
||||
year={2022},
|
||||
organization={PMLR}
|
||||
}
|
||||
@article{zhang2023instruction,
|
||||
title={Instruction tuning for large language models: A survey},
|
||||
author={Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
|
||||
journal={arXiv preprint arXiv:2308.10792},
|
||||
year={2023}
|
||||
}
|
||||
@article{pfeiffer2020adapterfusion,
|
||||
title={Adapterfusion: Non-destructive task composition for transfer learning},
|
||||
author={Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
|
||||
journal={arXiv preprint arXiv:2005.00247},
|
||||
year={2020}
|
||||
}
|
||||
@article{pfeiffer2020adapterhub,
|
||||
title={Adapterhub: A framework for adapting transformers},
|
||||
author={Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
|
||||
journal={arXiv preprint arXiv:2007.07779},
|
||||
year={2020}
|
||||
}
|
||||
@article{lialin2023scaling,
|
||||
title={Scaling down to scale up: A guide to parameter-efficient fine-tuning},
|
||||
author={Lialin, Vladislav and Deshpande, Vijeta and Rumshisky, Anna},
|
||||
journal={arXiv preprint arXiv:2303.15647},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{lu2023uniadapter,
|
||||
title={Uniadapter: Unified parameter-efficient transfer learning for cross-modal modeling},
|
||||
author={Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
|
||||
journal={arXiv preprint arXiv:2302.06605},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{fedus2022switch,
|
||||
title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
|
||||
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
|
||||
journal={Journal of Machine Learning Research},
|
||||
volume={23},
|
||||
number={120},
|
||||
pages={1--39},
|
||||
year={2022}
|
||||
}
|
||||
@article{lepikhin2020gshard,
|
||||
title={Gshard: Scaling giant models with conditional computation and automatic sharding},
|
||||
author={Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
|
||||
journal={arXiv preprint arXiv:2006.16668},
|
||||
year={2020}
|
||||
}
|
||||
@article{luo2024moelora,
|
||||
title={Moelora: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
|
||||
author={Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
|
||||
journal={arXiv preprint arXiv:2402.12851},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2024large,
|
||||
title={Large language model based multi-agents: A survey of progress and challenges},
|
||||
author={Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
|
||||
journal={arXiv preprint arXiv:2402.01680},
|
||||
year={2024}
|
||||
}
|
||||
@article{zhao2023survey,
|
||||
title={A survey of large language models},
|
||||
author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
|
||||
journal={arXiv preprint arXiv:2303.18223},
|
||||
year={2023}
|
||||
}
|
||||
@article{gao2024higher,
|
||||
title={Higher layers need more lora experts},
|
||||
author={Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
|
||||
journal={arXiv preprint arXiv:2402.08562},
|
||||
year={2024}
|
||||
}
|
||||
@inproceedings{dou2024loramoe,
|
||||
title={LoRAMoE: Alleviating world knowledge forgetting in large language models via MoE-style plugin},
|
||||
author={Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
|
||||
booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
|
||||
pages={1932--1945},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@article{achiam2023gpt,
|
||||
title={Gpt-4 technical report},
|
||||
author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
|
||||
journal={arXiv preprint arXiv:2303.08774},
|
||||
year={2023}
|
||||
}
|
||||
@article{jaszczur2021sparse,
|
||||
title={Sparse is enough in scaling transformers},
|
||||
author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={9895--9907},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{standley2020tasks,
|
||||
title={Which tasks should be learned together in multi-task learning?},
|
||||
author={Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={9120--9132},
|
||||
year={2020},
|
||||
organization={PMLR}
|
||||
}
|
||||
@article{cai2024survey,
|
||||
title={A survey on mixture of experts},
|
||||
author={Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
|
||||
journal={arXiv preprint arXiv:2407.06204},
|
||||
year={2024}
|
||||
}
|
||||
@article{karimi2021compacter,
|
||||
title={Compacter: Efficient low-rank hypercomplex adapter layers},
|
||||
author={Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={1022--1035},
|
||||
year={2021}
|
||||
}
|
||||
@article{bommasani2021opportunities,
|
||||
title={On the opportunities and risks of foundation models},
|
||||
author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
|
||||
journal={arXiv preprint arXiv:2108.07258},
|
||||
year={2021}
|
||||
}
|
||||
@article{pan2024lisa,
|
||||
title={LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
|
||||
author={Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
|
||||
journal={arXiv preprint arXiv:2403.17919},
|
||||
year={2024}
|
||||
}
|
||||
@article{feng2024mixture,
|
||||
title={Mixture-of-loras: An efficient multitask tuning for large language models},
|
||||
author={Feng, Wenfeng and Hao, Chuzhan and Zhang, Yuewei and Han, Yu and Wang, Hao},
|
||||
journal={arXiv preprint arXiv:2403.03432},
|
||||
year={2024}
|
||||
}
|
||||
@article{lester2021power,
|
||||
title={The power of scale for parameter-efficient prompt tuning},
|
||||
author={Lester, Brian and Al-Rfou, Rami and Constant, Noah},
|
||||
journal={arXiv preprint arXiv:2104.08691},
|
||||
year={2021}
|
||||
}
|
||||
@article{zhou2024lima,
|
||||
title={Lima: Less is more for alignment},
|
||||
author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2024}
|
||||
}
|
||||
@article{wei2021finetuned,
|
||||
title={Finetuned language models are zero-shot learners},
|
||||
author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
|
||||
journal={arXiv preprint arXiv:2109.01652},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
@article{brynjolfsson2025generative,
|
||||
title={Generative AI at work},
|
||||
author={Brynjolfsson, Erik and Li, Danielle and Raymond, Lindsey},
|
||||
journal={The Quarterly Journal of Economics},
|
||||
pages={qjae044},
|
||||
year={2025},
|
||||
publisher={Oxford University Press}
|
||||
}
|
||||
@Misc{peft,
|
||||
title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
|
||||
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
|
||||
howpublished = {\url{https://github.com/huggingface/peft}},
|
||||
year = {2022}
|
||||
}
|
||||
@article{li2023chatdoctor,
|
||||
title={ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge},
|
||||
author={Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
|
||||
journal={Cureus},
|
||||
volume={15},
|
||||
number={6},
|
||||
year={2023},
|
||||
publisher={Cureus}
|
||||
}
|
||||
@online{DatabricksBlog2023DollyV2,
|
||||
author = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},
|
||||
title = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},
|
||||
year = {2023},
|
||||
url = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
|
||||
urldate = {2023-06-30}
|
||||
}
|
||||
@inproceedings{nakano2021webgpt,
|
||||
author = {Reiichiro Nakano and Jacob Hilton and Suchir Balaji and Jeff Wu and Long Ouyang and Christina Kim and Christopher Hesse and Shantanu Jain and Vineet Kosaraju and William Saunders and Xu Jiang and Karl Cobbe and Tyna Eloundou and Gretchen Krueger and Kevin Button and Matthew Knight and Benjamin Chess and John Schulman},
|
||||
title = {WebGPT: Browser-assisted question-answering with human feedback},
|
||||
booktitle = {arXiv},
|
||||
year = 2021,
|
||||
}
|
||||
@inproceedings{zhang2023automatic,
|
||||
title={Automatic Chain of Thought Prompting in Large Language Models},
|
||||
author={Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
|
||||
booktitle={The Eleventh International Conference on Learning Representations (ICLR 2023)},
|
||||
year={2023}
|
||||
}
|
||||
@article{zhao2024hypermoe,
|
||||
title={HyperMoE: Towards Better Mixture of Experts via Transferring Among Experts},
|
||||
author={Zhao, Hao and Qiu, Zihan and Wu, Huijia and Wang, Zili and He, Zhaofeng and Fu, Jie},
|
||||
journal={arXiv preprint arXiv:2402.12656},
|
||||
year={2024}
|
||||
}
|
||||
816
mypaper/IJCAI2026_MESSA.tex
Normal file
@@ -0,0 +1,816 @@
|
||||
\title{Multi-Task Shared-Specific Sparse Fine-Tuning for Large Language Models}
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
\begin{abstract}
|
||||
|
||||
|
||||
Large language models are increasingly required to support multiple downstream tasks under strict parameter budgets, yet many parameter-efficient fine-tuning (PEFT) methods introduce auxiliary modules that incur additional overhead.
|
||||
Sparse fine-tuning avoids this by directly applying sparse parameter updates to pretrained weights, without modifying model architectures or introducing inference latency.
|
||||
However, existing sparse fine-tuning methods are mostly designed for single-task settings and lack systematic modeling of structure sharing and budget allocation in multi-task scenarios.
|
||||
To tackle these challenges, we propose MESSA, a multi-task shared-specific sparse fine-tuning framework for large language models.
|
||||
MESSA decomposes task adaptations into globally shared and task-specific sparse deltas, allowing flexible sharing across related tasks.
|
||||
To enforce a unified parameter budget, MESSA adopts a budget-aware soft-to-hard structure learning strategy, where differentiable gates are first optimized to induce structured sparsity and then hardened via a single global pruning step.
|
||||
Extensive experiments on multi-task benchmarks demonstrate that MESSA consistently outperforms existing PEFT baselines under comparable parameter budgets.
|
||||
\end{abstract}
|
||||
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=1\linewidth]{assets/attndiff.png}
|
||||
\caption{Task-dependent activation differences between CodeAlpaca and MedQA across different attention modules and layers. Red indicates higher activation in CodeAlpaca, while blue indicates higher activation in MedQA. This highlights the task-specific and shared adaptation requirements at different layers and modules during multi-task fine-tuning.}
|
||||
\label{fig:attndiff}
|
||||
\end{figure}
|
||||
\section{Introduction}
|
||||
Large language models (LLMs) have emerged as general-purpose backbones for a wide range of real-world applications.
|
||||
In practical deployment scenarios, a single pre-trained model is often required to simultaneously support multiple downstream tasks under strict constraints on storage, training cost, and inference efficiency.
|
||||
These constraints make full-parameter fine-tuning impractical and have driven extensive research on parameter-efficient fine-tuning (PEFT) methods.
|
||||
|
||||
|
||||
|
||||
PEFT methods adapt LLMs by updating only a small subset of parameters while keeping most pre-trained weights frozen~\cite{han2024parameter}.
|
||||
Among these, sparse fine-tuning has recently attracted attention due to its favorable deployment properties~\cite{shiracite}.
|
||||
By directly applying sparse parameter updates to pre-trained weights, sparse fine-tuning avoids introducing additional modules or modifying the model architecture, thereby preventing extra inference latency.
|
||||
In contrast, while LoRA~\cite{hu2021lora} and adapter-based~\cite{houlsby2019parameter} approaches reduce trainable parameters, additional modules introduce extra complexity for deployment and task switching in multi-task settings.
|
||||
|
||||
Despite their success, most existing PEFT and sparse fine-tuning methods are developed primarily for single-task adaptation.
|
||||
When extended to multi-task scenarios, they encounter two fundamental challenges that remain insufficiently explored.
|
||||
\textbf{(1) Task Sharing Challenge:} Existing methods either enforce full sharing of sparse structures across tasks, failing to capture task-specific variations, or learn entirely separate updates for each task, which leads to redundant parameters and inefficient resource allocation.
|
||||
In addition, both strategies fail to model the partial and structured dependencies that commonly exist among tasks.
|
||||
As a result, they struggle to balance cross-task knowledge sharing with the flexibility required for effective task-specific adaptation.
|
||||
\textbf{(2) Resource Allocation Challenge:} Most existing methods allocate adaptation parameters independently for each task, often using uniform budget ratios or manually specified task constraints.
|
||||
However, the lack of a global allocation mechanism prevents shared and task-specific parameters from jointly competing for limited resources.
|
||||
This isolated allocation leads to either underutilization or over-allocation of parameters, ultimately resulting in suboptimal performance.
|
||||
|
||||
|
||||
Empirical observations from multi-task fine-tuning further highlight the heterogeneous adaptation requirements across tasks.
|
||||
As illustrated in Figure~\ref{fig:attndiff}, activation patterns exhibit significant task-dependent differences, indicating that certain layers and modules benefit from shared representations, while others require task-specific modifications.
|
||||
These observations emphasize the key challenge of multi-task sparse fine-tuning: how to allocate limited adaptation capacity across tasks under a unified parameter budget.
|
||||
The challenge extends beyond identifying which parameters to update, and critically involves determining how to balance shared and task-specific adaptations.
|
||||
In this setting, each parameter group faces a discrete structural decision: it may remain frozen, be shared across tasks, or be specialized for a particular task.
|
||||
These decisions are inherently interdependent: allocating more shared parameters can improve cross-task generalization but reduces the budget available for task-specific adaptation, whereas excessive task-specific updates introduce redundancy and inefficiency.
|
||||
However, existing approaches typically rely on static or heuristic allocation strategies, which lack the flexibility to adaptively balance shared and task-specific structures based on task relationships and training signals.
|
||||
|
||||
To address these challenges, we formulate multi-task sparse fine-tuning as a structure allocation problem under a unified parameter budget.
|
||||
The objective is to allocate sparse adaptation capacity within a fixed backbone, balancing shared knowledge across tasks with task-specific specialization, while adhering to global resource constraints.
|
||||
Based on this, we propose \textbf{MESSA}, \textbf{M}ulti-task \textbf{E}fficient \textbf{S}hared-specific \textbf{S}parse \textbf{A}daptation, a shared-specific sparse fine-tuning framework for multi-task adaptation of LLMs.
|
||||
MESSA decomposes the adaptation for each task into the sum of a globally shared sparse update and a task-specific sparse update, enabling flexible modeling of both common and task-dependent knowledge.
|
||||
To determine how adaptation capacity should be allocated, MESSA introduces a budget-aware soft gating mechanism that induces structured sparsity during training.
|
||||
After learning the soft structure, a one-shot global pruning step is applied to convert the soft gates into a fixed sparse model, ensuring no additional inference overhead while preserving performance.
|
||||
|
||||
By efficiently allocating sparse adaptation capacity across tasks and explicitly modeling shared and task-specific structures, MESSA significantly improves parameter efficiency and multi-task performance compared to existing PEFT methods.
|
||||
Importantly, MESSA does not modify the backbone architecture or introduce auxiliary modules, making it well-suited for practical deployment.
|
||||
The main contributions of this paper are summarized as follows:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item We propose MESSA, a novel shared-specific sparse fine-tuning framework for multi-task adaptation of LLMs.
|
||||
MESSA explicitly models both cross-task shared knowledge and task-specific adaptation within a unified parameter budget, enabling flexible knowledge sharing across tasks while maintaining task-specific specialization.
|
||||
|
||||
\item We formulate multi-task sparse fine-tuning as a structure allocation problem and introduce a budget-aware soft-to-hard structure learning approach.
|
||||
This approach automatically allocates sparse adaptation capacity via soft gating and produces a deployable, performance-preserving sparse model through one-shot pruning.
|
||||
|
||||
\item Extensive experiments on diverse multi-task benchmarks demonstrate that MESSA outperforms existing PEFT baselines under identical parameter budgets, validating its effectiveness, efficiency, and scalability.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\begin{figure*}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{assets/model2.pdf}
|
||||
\caption{MESSA framework with shared--specific sparse updates. Sparse structures are learned via budget-aware soft gating and overlap regularization, and hardened through a soft-to-hard training process under a unified parameter budget.}
|
||||
\label{fig:framework}
|
||||
\end{figure*}
|
||||
|
||||
\section{Preliminaries and Problem Setup}
|
||||
\label{sec:pre}
|
||||
|
||||
In this section, we first review sparse parameter-efficient fine-tuning and formally define the general budget-constrained multi-task fine-tuning problem studied in this work.
|
||||
|
||||
\subsection{Sparse Parameter-Efficient Fine-Tuning}
|
||||
|
||||
Parameter-efficient fine-tuning (PEFT) aims to adapt a pre-trained LLM by updating only a small subset of parameters while keeping the backbone frozen.
|
||||
Let $\mathcal{M}$ denote a model with parameters $\mathbf{W}$.
|
||||
Sparse PEFT parameterizes updates as a sparse update $\Delta$, yielding the adapted model for a task $t$:
|
||||
\begin{equation}
|
||||
\mathcal{M}^{(t)} = \mathcal{M} + \Delta^{(t)},
|
||||
\end{equation}
|
||||
where $\Delta^{(t)}$ denotes the sparse task-specific adaptation for task $t$, \ie only a small fraction of its entries are non-zero.
|
||||
|
||||
In practice, sparse updates are typically applied in a structured manner, where parameters are selected and updated at the level of parameter groups (\eg rows or blocks of weight matrices), rather than individual scalar weights.
|
||||
This selective update mechanism allows for efficient fine-tuning with minimal parameter overhead.
|
||||
In contrast to low-rank methods such as LoRA~\cite{hu2021lora} and adapter-based PEFT methods~\cite{houlsby2019parameter}, which introduce additional modules to parameterize task adaptations, sparse PEFT directly modifies existing weights and preserves the original model architecture, avoiding additional inference overhead.
|
||||
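To make this concrete, the following sketch (an illustrative assumption using NumPy and example shapes, not part of our implementation) shows how a row-structured sparse update is folded directly into a frozen weight, so that inference retains the original architecture and a single matrix multiplication:
\begin{verbatim}
# Illustrative sketch (assumption): a sparse update is merged directly
# into the pre-trained weight, so inference keeps a single matmul and
# the original architecture, with no adapter modules.
import numpy as np

d_out, d_in = 8, 16
W = np.random.randn(d_out, d_in)             # frozen pre-trained weight
delta = np.zeros((d_out, d_in))              # sparse update (few rows non-zero)
delta[[2, 7]] = 0.01 * np.random.randn(2, d_in)

W_merged = W + delta                          # deployable weight, same shape
x = np.random.randn(d_in)
y = W_merged @ x                              # unchanged inference path
\end{verbatim}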
|
||||
\subsection{Budget-Constrained Multi-Task Fine-Tuning}
|
||||
|
||||
We consider a multi-task learning setting with $T$ downstream tasks $\{\mathcal{T}_t\}_{t=1}^T$.
|
||||
Under sparse PEFT, each task is adapted by a sparse update, and all task adaptations must jointly satisfy a unified global parameter budget.
|
||||
Formally, we decompose the sparse adaptation for task $t$ into two components:
|
||||
\begin{equation}
|
||||
\Delta^{(t)} = \Delta_{\mathrm{sh}} + \Delta_{\mathrm{sp}}^{(t)},
|
||||
\end{equation}
|
||||
where $\Delta_{\mathrm{sh}}$ is a shared sparse update applied across all tasks, and $\Delta_{\mathrm{sp}}^{(t)}$ denotes a task-specific update unique to task $t$.
|
||||
|
||||
|
||||
We assume that sparse updates are organized into structured parameter groups.
|
||||
Let $\mathcal{G}$ denote the collection of all parameter groups, and $s_g$ represent the parameter cost associated with group $g \in \mathcal{G}$.
|
||||
A parameter group is considered active if it is selected for updating in either the shared or task-specific component.
|
||||
The total adaptation cost across all tasks is constrained by a unified budget $B$, formalized as:
|
||||
\begin{equation}
|
||||
\sum_{g \in \mathcal{G}} s_g \cdot \mathbb{I}[g \in \Delta_{\mathrm{sh}}]
|
||||
+ \sum_{t=1}^T \sum_{g \in \mathcal{G}} s_g \cdot \mathbb{I}[g \in \Delta_{\mathrm{sp}}^{(t)}]
|
||||
\le B,
|
||||
\end{equation}
|
||||
where $\mathbb{I}[\cdot]$ is an indicator function that takes the value 1 if the parameter group $g$ is activated in the update and 0 otherwise. Note that task-specific updates are counted separately for each task, as they correspond to distinct parameter tensors even when they operate on the same backbone group.
|
||||
|
||||
The objective of budget-constrained multi-task fine-tuning is to allocate limited adaptation capacity across shared and task-specific updates so that all tasks are effectively adapted while satisfying the global budget constraint.
|
||||
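As a minimal sketch of this accounting, assuming row-level groups with costs $s_g$ (function and variable names below are illustrative only), the budget check can be expressed as follows:
\begin{verbatim}
# Illustrative sketch (assumption): accounting for the unified budget B.
# Each group g costs s_g parameters; shared groups are counted once,
# task-specific groups once per task that activates them.
def within_budget(shared_groups, specific_groups_per_task, s, B):
    # shared_groups: set of group ids in Delta_sh
    # specific_groups_per_task: list (over tasks) of sets of group ids
    # s: dict mapping group id -> parameter cost s_g
    cost = sum(s[g] for g in shared_groups)
    for groups_t in specific_groups_per_task:
        cost += sum(s[g] for g in groups_t)
    return cost <= B, cost

# Example: two tasks sharing group 0, each specializing one extra row.
ok, used = within_budget({0}, [{1}, {2}], {0: 16, 1: 16, 2: 16}, B=64)
\end{verbatim}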
|
||||
|
||||
|
||||
\section{Method}
|
||||
\label{sec:method}
|
||||
|
||||
In this section, we present the MESSA framework, which addresses the challenges of budget-constrained multi-task sparse fine-tuning. We begin with an overview of the framework, then describe its key components in detail, and finally present the overall algorithm.
|
||||
\subsection{Framework Overview}
|
||||
|
||||
In multi-task scenarios, existing PEFT methods exhibit two fundamental limitations:
|
||||
(1) they ignore partial yet significant dependencies between tasks, resulting in redundant and inefficient resource allocation; and
|
||||
(2) they lack a global mechanism to balance shared and task-specific adaptation under a unified parameter budget, which prevents efficient allocation of adaptation capacity across tasks.
|
||||
|
||||
|
||||
|
||||
To address these challenges, we propose \textsc{MESSA} (Multi-Task Efficient Shared-Specific Sparse Adaptation), a framework that formulates multi-task sparse fine-tuning as a structured allocation problem.
|
||||
The key insight is that parameter groups should be treated as decision units and explicitly assigned to remain frozen, be shared across tasks, or be specialized for individual tasks, while being optimized under a unified global budget constraint.
|
||||
As illustrated in Figure~\ref{fig:framework}, \textsc{MESSA} allocates sparse adaptation capacity by decomposing each task's adaptation into shared and task-specific sparse updates using the proposed Shared-Specific Sparse Representation (SS-Sparse), organized into structured, row-wise parameter groups, thereby modeling both common and task-dependent knowledge.
|
||||
A budget-aware soft gating mechanism guides this allocation, and after learning the soft structure, a one-shot pruning step converts it into a fixed, deployable sparse model.
|
||||
By jointly balancing shared and task-specific adaptations, \textsc{MESSA} improves multi-task performance and parameter efficiency, without introducing additional modules or inference latency, making it well suited for real-world multi-task deployment.
|
||||
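The overview above does not fix a particular gate parameterization. As a minimal sketch, one common realization uses a sigmoid gate per parameter group followed by a single global pruning pass under the budget; the exact rule and all names below are assumptions for exposition, not the parameterization used by MESSA.
\begin{verbatim}
# Illustrative sketch (assumption): one common way to realize a
# budget-aware soft-to-hard gate over parameter groups.
import numpy as np

def soft_gate(logits):
    return 1.0 / (1.0 + np.exp(-logits))         # differentiable gate per group

def harden(logits, costs, budget):
    # One-shot global pruning: keep the highest-gated groups whose
    # cumulative cost stays within the unified budget.
    order = np.argsort(-soft_gate(logits))
    keep, used = np.zeros_like(logits, dtype=bool), 0
    for g in order:
        if used + costs[g] <= budget:
            keep[g] = True
            used += costs[g]
    return keep                                    # fixed binary structure

mask = harden(np.array([2.0, -1.0, 0.5]), np.array([16, 16, 16]), budget=32)
\end{verbatim}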
|
||||
|
||||
\subsection{Shared-Specific Sparse Representation}
|
||||
\label{sec:ss_sparse}
|
||||
Effective multi-task adaptation requires capturing both cross-task commonality and task-specific specialization.
|
||||
Figure~\ref{fig:attndiff} reveals heterogeneous task-dependent activation patterns, indicating the need for both shared and task-specific adaptation.
|
||||
Motivated by this, we introduce \textbf{Shared-Specific Sparse Representation (SS-Sparse)}, which decomposes each task's adaptation into shared and task-specific sparse components.
|
||||
This representation provides an explicit and structured foundation for modeling task relatedness and specialization.
|
||||
|
||||
\subsubsection{Multi-Task Shared-Specific Delta Decomposition}
|
||||
We model multi-task sparse fine-tuning by decomposing the adaptation for each task into a shared component and a task-specific component. Formally, given a frozen backbone model $\mathcal{M}$ with parameters $\mathbf{W}$, the adapted model for task $t$ is defined by modifying the frozen parameters as
|
||||
\begin{equation}
|
||||
\mathcal{M}^{(t)} = \mathcal{M} + \Delta^{(t)},
|
||||
\label{eq:task_model}
|
||||
\end{equation}
|
||||
where the task-specific adaptation $\Delta^{(t)}$ is decomposed as
|
||||
\begin{equation}
|
||||
\Delta^{(t)} = \Delta_{\mathrm{sh}} + \Delta_{\mathrm{sp}}^{(t)}.
|
||||
\label{eq:ss_decomp}
|
||||
\end{equation}
|
||||
|
||||
Here, $\Delta_{\mathrm{sh}}$ denotes a sparse update shared across tasks, capturing cross-task commonality, while $\Delta_{\mathrm{sp}}^{(t)}$ represents a task-specific update, modeling task-dependent variations.
|
||||
This decomposition explicitly separates shared and specialized adaptation capacity within a single sparse update formulation.
|
||||
|
||||
This decomposition offers two key advantages.
|
||||
First, it allows related tasks to reuse a common set of sparse updates, reducing parameter redundancy and improving parameter efficiency.
|
||||
Second, it preserves sufficient flexibility for task-specific adaptation, avoiding the restrictive assumption of complete sharing.
|
||||
Compared to approaches that enforce either fully shared or fully independent adaptations, the shared-specific decomposition in Eq.~\ref{eq:ss_decomp} provides a more expressive and balanced formulation for multi-task sparse fine-tuning.
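As a minimal illustration of Eq.~\ref{eq:ss_decomp} (assuming the updates are stored as dense tensors for clarity; names are hypothetical), the adapted weights for task $t$ are obtained by simple addition:

\begin{verbatim}
# Illustrative sketch of the shared-specific decomposition.
# W: frozen backbone weight; delta_sh: shared sparse update;
# delta_sp: list of task-specific sparse updates indexed by task t
def adapted_weight(W, delta_sh, delta_sp, t):
    return W + delta_sh + delta_sp[t]
\end{verbatim}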
|
||||
|
||||
\subsubsection{Row-Wise Structured Parameter Groups}
|
||||
|
||||
To enable structured sparsity and efficient allocation of the shared and task-specific updates, we organize sparse updates into parameter groups.
|
||||
In this work, we adopt a row-wise grouping strategy for linear layers.
|
||||
Specifically, for a linear transformation with weight matrix $\mathbf{W} \in \mathbb{R}^{d_{\text{out}} \times d_{\text{in}}}$, each output row is treated as a distinct parameter group, serving as a basic decision unit.
|
||||
Let $\mathcal{G}$ denote the set of all parameter groups, and let $g \in \mathcal{G}$ index a group corresponding to one output row. The parameter cost of each group is defined as
|
||||
\begin{equation}
|
||||
s_g = d_{\text{in}},
|
||||
\label{eq:row_cost}
|
||||
\end{equation}
|
||||
reflecting the number of parameters associated with that row.
|
||||
|
||||
Row-wise grouping provides a favorable balance between flexibility and structure.
|
||||
Compared to element-wise sparsity, it significantly reduces the number of structural decisions and yields contiguous parameter blocks that are easy to prune and deploy.
|
||||
Compared to coarser groupings such as entire layers, it enables fine-grained allocation of adaptation capacity.
|
||||
Moreover, in Transformer-based models, row-wise groups naturally align with output neurons and attention projections, making them suitable units for selective adaptation.
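Under this scheme, a $d_{\text{out}} \times d_{\text{in}}$ weight matrix yields $d_{\text{out}}$ groups, each with cost $d_{\text{in}}$; a minimal sketch (hypothetical helper, illustrative only):

\begin{verbatim}
# Illustrative row-wise grouping: one group per output row,
# each costing d_in parameters (cf. Eq. for s_g).
def row_groups(d_out, d_in):
    return {g: d_in for g in range(d_out)}
\end{verbatim}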
|
||||
|
||||
\subsubsection{Group-Level Soft Gating}
|
||||
To enable differentiable structural allocation over parameter groups, we associate each parameter group with learnable soft gates.
|
||||
For each group $g \in \mathcal{G}$, we introduce a shared gate $z^{\mathrm{sh}}_g \in (0,1)$ and task-specific gates
|
||||
$z^{\mathrm{sp}}_{g,t} \in (0,1)$, which modulate the contributions of the shared and task-specific components, respectively.
|
||||
Under this group-wise representation, the shared and task-specific sparse updates can be expressed as
|
||||
\begin{equation}
|
||||
\Delta^{(t)} = \sum_{g \in \mathcal{G}} \left(
|
||||
z^{\mathrm{sh}}_g \cdot \Delta^{\mathrm{sh}}_g +
|
||||
z^{\mathrm{sp}}_{g,t} \cdot \Delta^{\mathrm{sp}}_{g,t}
|
||||
\right),
|
||||
\label{eq:gated_delta}
|
||||
\end{equation}
|
||||
where $\Delta^{\mathrm{sh}}_g$ and $\Delta^{\mathrm{sp}}_{g,t}$ denote the parameters associated with group $g$ in the shared and task-specific updates.
|
||||
|
||||
The soft gates serve as continuous allocation weights over parameter groups and act as differentiable proxies for discrete structural decisions.
|
||||
During training, a parameter group can simultaneously participate in both shared and task-specific updates,
|
||||
allowing the model to explore different degrees of sharing across tasks.
|
||||
This design enables gradient-based optimization of both parameter values and structure-related decision variables, and provides a continuous foundation for subsequent structure regularization and soft-to-hard selection.
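A minimal sketch of the gated update in Eq.~\ref{eq:gated_delta} for a single linear layer, assuming row-wise groups and dense storage (hypothetical names, illustrative only):

\begin{verbatim}
import torch

# z_sh: (d_out,) shared gates; z_sp: (T, d_out) task-specific gates
# D_sh: (d_out, d_in) shared update; D_sp: (T, d_out, d_in) task updates
def gated_delta(z_sh, z_sp, D_sh, D_sp, t):
    return z_sh.unsqueeze(1) * D_sh + z_sp[t].unsqueeze(1) * D_sp[t]
\end{verbatim}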
|
||||
|
||||
\subsubsection{Shared-Specific Overlap Regularization}
|
||||
|
||||
While the shared-specific decomposition provides flexibility, excessive simultaneous activation of both shared and task-specific components may lead to redundant adaptation and unclear structural separation.
|
||||
To mitigate this issue, we introduce a shared-specific overlap regularization that penalizes concurrent activation of shared and task-specific gates. Specifically, we define the overlap regularization term as
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{overlap}} = \sum_{t=1}^T \sum_{g \in \mathcal{G}} z^{\mathrm{sh}}_g \cdot z^{\mathrm{sp}}_{g,t},
|
||||
\label{eq:overlap}
|
||||
\end{equation}
|
||||
which assigns a higher penalty when both the shared gate $z^{\mathrm{sh}}_g$ and the task-specific gate $z^{\mathrm{sp}}_{g,t}$ are simultaneously active.
|
||||
This regularization encourages each parameter group toward being primarily assigned to either shared or task-specific adaptation, while preserving flexibility.
|
||||
By promoting clearer structural separation, it reduces redundant updates and improves the efficiency of sparse adaptation under a global budget.
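Because the gates can be stored as tensors, the penalty in Eq.~\ref{eq:overlap} reduces to an elementwise product followed by a sum; a minimal sketch (hypothetical names, illustrative only):

\begin{verbatim}
import torch

# z_sh: (G,) shared gates; z_sp: (T, G) task-specific gates in (0, 1)
def overlap_loss(z_sh, z_sp):
    return (z_sh.unsqueeze(0) * z_sp).sum()
\end{verbatim}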
|
||||
|
||||
While the overlap regularization guides soft structural allocation during training, the final sparse structure must satisfy a global budget constraint and be converted into a discrete, deployable form, which we describe next.
|
||||
|
||||
\subsection{Soft-to-Hard Structure Learning}
|
||||
\label{sec:soft_to_hard}
|
||||
|
||||
|
||||
Building on the shared-specific sparse representation introduced in Section~\ref{sec:ss_sparse},
|
||||
we describe how the sparse structure is learned and fixed under a unified budget to produce a deployable sparse model.
|
||||
To reduce the search space, we first construct a candidate pool whose size is a small multiple of the target budget, selecting groups by the row-wise weight norm of the pretrained model with a small fraction of random inclusions for diversity. We then adopt a soft-to-hard structure learning strategy that hardens the learned soft structural preferences into a fixed sparse structure via one-shot pruning.
|
||||
|
||||
|
||||
|
||||
\paratitle{Warmup Phase.}
|
||||
At the beginning of training, sparse adaptation parameters and structural gates are not yet informative.
|
||||
To avoid unstable allocation decisions, we introduce a warmup phase, applied for an initial period of training, in which the gating variables are frozen
|
||||
and only the sparse adaptation parameters within the candidate pool are optimized.
|
||||
During this phase, training minimizes the task loss:
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{warmup}} = \mathcal{L}_{\text{task}}.
|
||||
\label{eq:warm_loss}
|
||||
\end{equation}
|
||||
|
||||
This warmup allows sparse updates to learn meaningful task-related representations, providing a stable initialization for subsequent budget-aware structure learning.
|
||||
|
||||
|
||||
\paratitle{Budget-Aware Soft Learning.}
|
||||
After the warmup phase, we jointly optimize the sparse adaptation parameters and structural gates under a unified budget.
|
||||
At this stage, the soft gates act as continuous allocation variables, enabling differentiable structure learning.
|
||||
To incorporate the budget, we define the expected adaptation cost associated with the soft gates as
|
||||
\begin{equation}
|
||||
\mathcal{C}_{\text{soft}} =
|
||||
\sum_{g \in \mathcal{G}} s_g \cdot z^{\mathrm{sh}}_g
|
||||
+ \sum_{t=1}^T \sum_{g \in \mathcal{G}} s_g \cdot z^{\mathrm{sp}}_{g,t},
|
||||
\label{eq:soft_cost}
|
||||
\end{equation}
|
||||
where $s_g$ denotes the parameter cost of group $g$ defined in Eq.~\ref{eq:row_cost}.
|
||||
This soft cost represents the expected number of activated parameters under the soft gates,
|
||||
and serves as a differentiable approximation to the discrete budget constraint.
|
||||
|
||||
We enforce the budget by penalizing violations of the target budget $B$ through a regularization term:
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{budget}} =
|
||||
\max \left( 0,\ \mathcal{C}_{\text{soft}} - B \right),
|
||||
\label{eq:budget_loss}
|
||||
\end{equation}
|
||||
which softly discourages the expected adaptation cost from exceeding the target budget.
|
||||
During this phase, the overall training objective is given by
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{soft}} =
|
||||
\mathcal{L}_{\text{task}}
|
||||
+ \mathcal{L}_{\text{budget}}
|
||||
+ \lambda_{\text{overlap}} \mathcal{L}_{\text{overlap}},
|
||||
\label{eq:soft_objective}
|
||||
\end{equation}
|
||||
where $\mathcal{L}_{\text{overlap}}$ is the shared-specific overlap regularization defined in Eq.~\ref{eq:overlap}.
|
||||
This objective jointly balances task performance, structural sparsity, and shared-specific separation.
|
||||
The resulting soft structural preferences provide the basis for deriving a discrete,
|
||||
budget-satisfying sparse structure.
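A minimal sketch of the soft objective in Eqs.~\ref{eq:soft_cost}--\ref{eq:soft_objective}, assuming the gates and group costs are stored as tensors (hypothetical names, illustrative only):

\begin{verbatim}
import torch

# s: (G,) per-group costs; z_sh: (G,); z_sp: (T, G); B: global budget
def soft_objective(task_loss, s, z_sh, z_sp, B, lam_overlap):
    c_soft = (s * z_sh).sum() + (s.unsqueeze(0) * z_sp).sum()
    budget_loss = torch.clamp(c_soft - B, min=0.0)
    overlap = (z_sh.unsqueeze(0) * z_sp).sum()
    return task_loss + budget_loss + lam_overlap * overlap
\end{verbatim}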
|
||||
|
||||
\paratitle{One-Shot Hard Selection.}
|
||||
After budget-aware soft learning, we convert the learned soft structure into a fixed and deployable sparse structure
|
||||
via a \emph{one-shot hard selection} procedure, in which discrete structural decisions are made once.
|
||||
Specifically, parameter groups are ranked according to their learned gate values
|
||||
(\ie $z^{\mathrm{sh}}_g$ for shared updates and $z^{\mathrm{sp}}_{g,t}$ for task-specific ones),
|
||||
and groups with higher scores are selected first until the global budget constraint is satisfied.
|
||||
|
||||
All non-selected groups are pruned by setting their updates to zero, while the selected sparse updates are fixed for inference.
|
||||
As a result, the final model has a fixed sparse structure and introduces no additional overhead at inference time.
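The selection step amounts to ranking all candidate groups by their gate values and greedily admitting them until the budget is exhausted; a minimal sketch (hypothetical names, illustrative only):

\begin{verbatim}
# candidates: list of (gate_value, cost, key) over shared and
# task-specific groups; budget: global parameter budget B
def hard_select(candidates, budget):
    selected, used = set(), 0
    for gate, cost, key in sorted(candidates, key=lambda c: -c[0]):
        if used + cost <= budget:
            selected.add(key)
            used += cost
    return selected  # non-selected groups are pruned (set to zero)
\end{verbatim}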
|
||||
|
||||
\subsection{Overall Algorithm}
|
||||
\label{sec:algorithm}
|
||||
|
||||
We summarize the overall training procedure of MESSA, which jointly integrates
|
||||
Shared-Specific Sparse Representation (SS-Sparse) and soft-to-hard structure learning
|
||||
to allocate sparse adaptation capacity across multiple tasks under a global budget
|
||||
and produce a fixed, deployable sparse model.
|
||||
|
||||
|
||||
Algorithm~\ref{alg:messa} summarizes the overall training procedure of \textsc{MESSA}.
|
||||
The method follows a soft-to-hard structure learning paradigm for budget-constrained multi-task adaptation.
|
||||
Specifically, \textsc{MESSA} starts with a warmup stage that stabilizes sparse adaptation parameters while keeping structural gates frozen.
|
||||
It then performs budget-aware soft learning, jointly optimizing sparse parameters and soft gates to induce shared and task-specific structures under the global budget.
|
||||
Finally, a one-shot hard selection step converts the learned soft structure into a discrete sparse structure that strictly satisfies the budget, yielding a fixed and deployable model with no additional inference overhead.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
\begin{algorithm}[t]
|
||||
\small
|
||||
\caption{\textsc{MESSA}: Soft-to-Hard Multi-Task Sparse Fine-Tuning}
|
||||
\label{alg:messa}
|
||||
\KwIn{
|
||||
Frozen backbone model $\mathcal{M}$;
|
||||
tasks $\{\mathcal{T}_t\}_{t=1}^T$;
|
||||
global budget $B$; training steps $S$
|
||||
}
|
||||
\KwOut{
|
||||
Fixed shared sparse update $\Delta_{\mathrm{sh}}$;
|
||||
fixed task-specific sparse updates $\{\Delta_{\mathrm{sp}}^{(t)}\}_{t=1}^T$
|
||||
}
|
||||
|
||||
1. Initialize shared and task-specific sparse updates
|
||||
$\Delta_{\mathrm{sh}}, \Delta_{\mathrm{sp}}^{(t)} \leftarrow \mathbf{0}$ for all $t$\;
|
||||
2. Initialize soft gates for all parameter groups\;
|
||||
3. Construct candidate pool $\mathcal{C}$ based on row-wise weight norm\;
|
||||
4. Set warmup steps $S_{\mathrm{warmup}}$ and pruning step $S_{\mathrm{prune}}$\;
|
||||
|
||||
\For{$s = 1$ \KwTo $S$}{
|
||||
Sample a task $t$ and a mini-batch from $\mathcal{T}_t$\;
|
||||
|
||||
\If{$s \le S_{\mathrm{warmup}}$}{
|
||||
Freeze all soft gates\;
|
||||
Update $\Delta_{\mathrm{sh}}$ and $\Delta_{\mathrm{sp}}^{(t)}$
|
||||
within candidate pool $\mathcal{C}$ using task loss $\mathcal{L}_{\text{task}}$ (Eq.~\ref{eq:warm_loss})\;
|
||||
}
|
||||
\Else{
|
||||
Compute SS-Sparse gated updates using soft gates (Eq.~\ref{eq:gated_delta})\;
|
||||
Optimize sparse updates and soft gates using the budget-aware objective
|
||||
$\mathcal{L}_{\text{soft}}$ (Eq.~\ref{eq:soft_objective})\;
|
||||
}
|
||||
|
||||
\If{$s = S_{\mathrm{prune}}$}{
|
||||
Rank parameter groups by soft gate values\;
|
||||
Select shared and task-specific groups under budget $B$\;
|
||||
Convert soft gates to binary masks and prune unselected groups\;
|
||||
Fix the sparse structure for the remaining training steps\;
|
||||
}
|
||||
}
|
||||
|
||||
\Return{$\Delta_{\mathrm{sh}}, \{\Delta_{\mathrm{sp}}^{(t)}\}_{t=1}^T$}
|
||||
\end{algorithm}
|
||||
|
||||
|
||||
\begin{table*}[t]
|
||||
\small
|
||||
\centering
|
||||
\caption{Overall multi-task performance of different PEFT methods across backbone LLMs under a comparable parameter budget.
|
||||
Avg, Geo, and Worst denote Macro Average, Geometric Mean, and Worst-Task performance. Bold and underlined values indicate the best and second-best results, $^{*}$ marks statistically significant improvements over the best baseline ($p<0.05$), and Param (\%) reports the trainable parameter ratio.}
|
||||
|
||||
\label{tab:exp1}
|
||||
\resizebox{0.95\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.05}
|
||||
\begin{tabular}{l|c|ccc|ccc|ccc}
|
||||
\toprule
|
||||
\multirow{2}{*}{Method}
|
||||
& \multirow{2}{*}{\makecell{Avg.Param \\(\%)}}
|
||||
& \multicolumn{3}{c|}{Qwen 3 4B}
|
||||
& \multicolumn{3}{c|}{LLaMA 3.2 3B}
|
||||
& \multicolumn{3}{c}{Gemma 3 4B} \\
|
||||
\cmidrule(lr){3-5} \cmidrule(lr){6-8} \cmidrule(lr){9-11}
|
||||
&
|
||||
& Avg$\uparrow$ & Geo$\uparrow$ & Worst$\uparrow$
|
||||
& Avg$\uparrow$ & Geo$\uparrow$ & Worst$\uparrow$
|
||||
& Avg$\uparrow$ & Geo$\uparrow$ & Worst$\uparrow$ \\
|
||||
\midrule
|
||||
LoRA (shared)
|
||||
& 2.25
|
||||
& 76.47 & 75.56 & 59.81
|
||||
& 67.05 & 65.99 & 53.03
|
||||
& 71.22 & 69.53 & 50.08 \\
|
||||
|
||||
LoRA (specific)
|
||||
& 2.25
|
||||
& \underline{76.66} & \underline{75.76} & 60.75
|
||||
& 64.70 & 63.29 & 52.75
|
||||
& \underline{71.86} & \underline{70.09} & 49.45 \\
|
||||
|
||||
AdaLoRA (shared)
|
||||
& 2.50
|
||||
& 74.82 & 73.94 & 58.24
|
||||
& 63.02 & 62.10 & 51.18
|
||||
& 65.39 & 62.85 & 42.27 \\
|
||||
|
||||
AdaLoRA (specific)
|
||||
& 2.50
|
||||
& 75.45 & 74.61 & 59.18
|
||||
& 62.94 & 61.99 & 53.03
|
||||
& 66.57 & 64.00 & 43.33 \\
|
||||
|
||||
\midrule
|
||||
SHiRA (shared)
|
||||
& 2.26
|
||||
& 74.60 & 73.51 & 56.99
|
||||
& 70.35 & 69.40 & 53.06
|
||||
& 67.99 & 65.64 & 44.27 \\
|
||||
|
||||
SHiRA (specific)
|
||||
& 2.26
|
||||
& 76.62 & 75.67 & \underline{62.64}
|
||||
& 66.94 & 65.62 & 51.33
|
||||
& 71.26 & 69.52 & \underline{50.86} \\
|
||||
|
||||
\midrule
|
||||
MTLoRA
|
||||
& 2.70
|
||||
& \underline{76.81} & \underline{75.98} & 62.01
|
||||
& \underline{71.95} & \underline{71.29} & \underline{58.08}
|
||||
& 71.60 & 69.84 & 50.24 \\
|
||||
|
||||
MOELoRA
|
||||
& 2.26
|
||||
& 76.07 & 75.27 & 60.91
|
||||
& 70.96 & 70.24 & 55.42
|
||||
& 70.52 & 68.78 & 48.67 \\
|
||||
|
||||
MESSA (ours)
|
||||
& 1.86
|
||||
& \textbf{78.01}$^{*}$ & \textbf{77.18}$^{*}$ & \textbf{62.79}$^{*}$
|
||||
& \textbf{72.96}$^{*}$ & \textbf{72.42}$^{*}$ & \textbf{59.50}$^{*}$
|
||||
& \textbf{72.40}$^{*}$ & \textbf{70.63}$^{*}$ & \textbf{51.33}$^{*}$ \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{table*}
|
||||
\begin{table}[t]
|
||||
\small
|
||||
\centering
|
||||
\caption{Scalability results of different PEFT methods across Qwen3 backbones with different model sizes.
|
||||
}
|
||||
|
||||
\label{tab:scale}
|
||||
\resizebox{1\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{1.1}
|
||||
\begin{tabular}{l|cc|cc|cc}
|
||||
\toprule
|
||||
Backbone LLM
|
||||
& \multicolumn{2}{c|}{Qwen 3 0.6B}
|
||||
& \multicolumn{2}{c|}{Qwen 3 1.7B}
|
||||
& \multicolumn{2}{c}{Qwen 3 4B} \\
|
||||
\midrule
|
||||
Metric
|
||||
& Avg$\uparrow$ & Geo$\uparrow$
|
||||
& Avg$\uparrow$ & Geo$\uparrow$
|
||||
& Avg$\uparrow$ & Geo$\uparrow$ \\
|
||||
\midrule
|
||||
LoRA (shared)
|
||||
& 58.97 & 55.91
|
||||
& 69.75 & 68.42
|
||||
& 76.47 & 75.56 \\
|
||||
|
||||
LoRA (specific)
|
||||
& 60.66 & 58.12
|
||||
& 69.67 & 68.30
|
||||
& 76.66 & 75.76 \\
|
||||
|
||||
SHiRA (shared)
|
||||
& 56.59 & 53.21
|
||||
& 68.47 & 66.83
|
||||
& 74.60 & 73.51 \\
|
||||
|
||||
SHiRA (specific)
|
||||
& 60.74 & 57.64
|
||||
& \underline{70.96} & \underline{69.76}
|
||||
& 76.62 & 75.67 \\
|
||||
|
||||
MTLoRA
|
||||
& \underline{61.13} & \underline{58.39}
|
||||
& 70.05 & 68.61
|
||||
& \underline{76.81} & \underline{75.98} \\
|
||||
|
||||
\textbf{MESSA (ours)}
|
||||
& \textbf{61.77} & \textbf{58.65}
|
||||
& \textbf{71.93} & \textbf{70.18}
|
||||
& \textbf{78.01} & \textbf{77.18} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{table}
|
||||
|
||||
\begin{table*}[t]
|
||||
\centering
|
||||
\footnotesize
|
||||
\caption{Per-task performance on the five evaluation datasets using the Qwen3-4B backbone.
|
||||
BoolQ, MedQA, and HellaSwag are evaluated by accuracy (Acc), GSM8K is evaluated by exact match (EM),
|
||||
and CodeAlpaca is evaluated by instruction compliance rate (ICR).
|
||||
Avg denotes the macro average across all tasks, while Geo denotes the geometric mean.
|
||||
Higher values indicate better performance.}
|
||||
\label{tab:crosstaskresult}
|
||||
\resizebox{0.8\linewidth}{!}{
|
||||
\renewcommand{\arraystretch}{0.94}
|
||||
\begin{tabular}{l|ccccc|cc}
|
||||
\toprule
|
||||
Dataset
|
||||
& BoolQ
|
||||
& CodeAlpaca
|
||||
& MedQA
|
||||
& GSM8K
|
||||
& HellaSwag
|
||||
& \multirow{2}{*}{Avg}
|
||||
& \multirow{2}{*}{Geo} \\
|
||||
\cmidrule(lr){1-6}
|
||||
Metric
|
||||
& Acc $\uparrow$
|
||||
& ICR $\uparrow$
|
||||
& Acc $\uparrow$
|
||||
& EM $\uparrow$
|
||||
& Acc $\uparrow$
|
||||
&
|
||||
& \\
|
||||
\midrule
|
||||
LoRA (shared)
|
||||
& 86.79
|
||||
& \underline{67.45}
|
||||
& 59.81
|
||||
& 77.27
|
||||
& 91.02
|
||||
& 76.47
|
||||
& 75.56 \\
|
||||
|
||||
LoRA (specific)
|
||||
& \underline{87.89}
|
||||
& 67.40
|
||||
& 60.75
|
||||
& 76.06
|
||||
& 91.20
|
||||
& 76.66
|
||||
& 75.76 \\
|
||||
|
||||
AdaLoRA (shared)
|
||||
& 85.81
|
||||
& 66.55
|
||||
& 58.24
|
||||
& 75.61
|
||||
& 87.89
|
||||
& 74.82
|
||||
& 73.94 \\
|
||||
|
||||
AdaLoRA (specific)
|
||||
& 85.02
|
||||
& 66.75
|
||||
& 59.18
|
||||
& \underline{77.42}
|
||||
& 88.89
|
||||
& 75.45
|
||||
& 74.61 \\
|
||||
|
||||
\midrule
|
||||
SHiRA (shared)
|
||||
& 86.79
|
||||
& 64.65
|
||||
& 56.99
|
||||
& 74.85
|
||||
& 89.70
|
||||
& 74.60
|
||||
& 73.51 \\
|
||||
|
||||
SHiRA (specific)
|
||||
& 87.40
|
||||
& 63.50
|
||||
& \underline{62.64}
|
||||
& 77.73
|
||||
& \underline{91.83}
|
||||
& \underline{76.62}
|
||||
& \underline{75.67} \\
|
||||
|
||||
\midrule
|
||||
MTLoRA
|
||||
& 86.42
|
||||
& 66.35
|
||||
& 62.01
|
||||
& \underline{78.33}
|
||||
& 90.92
|
||||
& \underline{76.81}
|
||||
& \underline{75.98} \\
|
||||
|
||||
MOELoRA
|
||||
& 86.24
|
||||
& \underline{67.65}
|
||||
& 60.91
|
||||
& 75.61
|
||||
& 89.92
|
||||
& 76.07
|
||||
& 75.27 \\
|
||||
|
||||
\textbf{MESSA (ours)}
|
||||
& \textbf{88.07}
|
||||
& \textbf{68.30}
|
||||
& \textbf{62.79}
|
||||
& \textbf{78.33}
|
||||
& \textbf{92.57}
|
||||
& \textbf{78.01}
|
||||
& \textbf{77.18} \\
|
||||
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{table*}
|
||||
\section{Experiments}
|
||||
To comprehensively evaluate the performance of our proposed MESSA, we conduct extensive experiments guided by the following key research questions (RQs):
|
||||
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textbf{RQ1:} Does MESSA improve multi-task performance over strong PEFT baselines under a comparable global budget?
|
||||
\item \textbf{RQ2:} How does MESSA scale with backbone LLMs of different parameter sizes?
|
||||
\item \textbf{RQ3:} How do different components of MESSA contribute to its effectiveness under a unified budget?
|
||||
\item \textbf{RQ4:} What structural allocation patterns does MESSA learn across attention modules in multi-task adaptation?
|
||||
\end{itemize}
|
||||
|
||||
We first introduce the experimental setup and then systematically address each of the above research questions.
|
||||
|
||||
|
||||
\subsection{Experimental Setup}
|
||||
\paragraph{Datasets.}
|
||||
We evaluate our method on five diverse tasks including BoolQ~\cite{clark2019boolq}, CodeAlpaca~\cite{codealpaca}, MedQA~\cite{jin2020disease}, GSM8K~\cite{cobbe2021gsm8k}, and HellaSwag~\cite{zellers2019hellaswag}, which cover heterogeneous reasoning and generation scenarios for evaluating multi-task adaptation.
|
||||
For each task, we use its standard primary evaluation metric (Accuracy for classification-based reasoning tasks, Exact Match for GSM8K, and Instruction Compliance Rate for CodeAlpaca) and report three aggregated metrics, including Macro Average, Geometric Mean, and Worst-Task, to reflect average performance, balance, and robustness.
|
||||
Further details are provided in the Appendix.
|
||||
|
||||
\paragraph{Backbone Models.}
|
||||
We conduct experiments on multiple pre-trained LLM backbones including \textbf{Qwen 3}~\cite{qwen3technicalreport}, \textbf{LLaMA 3.2}~\cite{grattafiori2024llama}, and \textbf{Gemma 3}~\cite{gemma_2025} to evaluate performance and scalability.
|
||||
|
||||
\paragraph{Baseline Methods.}
|
||||
We compare our method with representative PEFT approaches from three categories.
|
||||
\textbf{Low-rank PEFT} baselines include LoRA~\cite{hu2021lora} and AdaLoRA~\cite{zhang2023adalora}.
|
||||
\textbf{Sparse PEFT} baselines include SHiRA~\cite{shiracite}.
|
||||
For these task-agnostic methods, we evaluate both \emph{task-specific} and \emph{shared} settings, where \emph{task-specific} assigns each task its own individual PEFT module, while \emph{shared} uses a single PEFT module shared across all tasks.
|
||||
In addition, MTLoRA~\cite{agiza2024mtlora} and MOELoRA~\cite{liu2024moe} are included as \textbf{multi-task-oriented PEFT} baselines.
|
||||
All methods are compared under matched parameter budgets for fairness.
|
||||
|
||||
|
||||
\paragraph{Implementation Details.}
|
||||
All experiments are conducted on NVIDIA GeForce RTX 4090 GPUs using PyTorch and HuggingFace Transformers.
|
||||
We use an AdamW optimizer with a learning rate of 1e-4.
|
||||
MESSA is applied to attention layers under a $2.5\%$ parameter budget, with a gate warmup ratio of $5\%$ and one-shot pruning at $15\%$ of the training steps. Further details are provided in the Appendix and our code\footnote{\codelink}.
|
||||
\subsection{Overall Multi-Task Performance (RQ1)}
|
||||
\label{sec:rq1}
|
||||
|
||||
We first compare MESSA with strong PEFT baselines under a unified parameter budget to evaluate overall multi-task effectiveness.
|
||||
Table~\ref{tab:exp1} reports the overall multi-task performance across three backbone LLMs under a unified parameter budget.
|
||||
MESSA consistently achieves the best results on all backbones, while using fewer trainable parameters than all baselines.
|
||||
Single-task-oriented PEFT methods, such as LoRA, AdaLoRA, and SHiRA, are not designed for budget-constrained multi-task adaptation.
|
||||
When extended to multi-task settings, they either enforce fully shared adaptations across tasks or allocate independent modules for each task.
|
||||
The former lacks the flexibility to capture task-specific variations, while the latter leads to inefficient parameter usage and suboptimal budget allocation when multiple tasks compete for limited adaptation capacity.
|
||||
Multi-task PEFT approaches such as MTLoRA and MOELoRA explicitly consider multiple tasks through routing or mixture mechanisms. However, they typically rely on heuristic or task-agnostic parameter allocation and do not model global budget competition at the structural level, so shared and task-specific parameters cannot be jointly optimized under a unified constraint.
|
||||
|
||||
To further understand how these overall gains are distributed across individual tasks, Table~\ref{tab:crosstaskresult} reports the per-task performance on the Qwen 3-4B backbone.
|
||||
MESSA improves performance on all five tasks, suggesting that it effectively balances shared and task-specific adaptations and yields gains across heterogeneous tasks rather than overfitting to a subset of them.
|
||||
|
||||
\subsection{Scalability across Backbone Sizes (RQ2)}
|
||||
\label{sec:rq2}
|
||||
|
||||
We next examine how MESSA scales with backbone LLMs of different parameter sizes.
|
||||
Table~\ref{tab:scale} reports results on Qwen 3 backbones ranging from 0.6B to 4B parameters.
|
||||
Across all model sizes, MESSA consistently achieves the best overall performance, indicating that its advantages are not limited to a specific model scale.
|
||||
Notably, the performance gains of MESSA remain stable as the backbone size increases, demonstrating that MESSA scales robustly with model size and can effectively exploit different backbones while maintaining parameter efficiency under a unified budget.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=1\linewidth]{assets/combined_ablation_module.pdf}
|
||||
\caption{(a) Ablation study of MESSA, showing the impact of different components on overall multi-task performance.
|
||||
(b) Selection rates of shared and task-specific updates across attention modules.}
|
||||
\label{fig:analysis}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Ablation and Structural Analysis (RQ3, 4)}
|
||||
\label{sec:analysis}
|
||||
Figure~\ref{fig:analysis}(a) presents an ablation study under the same unified budget.
|
||||
Removing any core component of MESSA leads to a consistent performance drop, indicating that its effectiveness relies on the joint design.
|
||||
In particular, gate warmup and soft-to-hard structure learning are important for discovering stable sparse structures, while overlap control between shared and task-specific updates helps avoid redundant parameter allocation under a global budget.
|
||||
|
||||
|
||||
Figure~\ref{fig:analysis}(b) illustrates the learned structural allocation across attention modules.
|
||||
Shared sparse updates are more frequently selected in the key projection, which we attribute to its role in defining task-agnostic attention compatibility, making it a natural target for shared adaptation under a unified budget.
|
||||
|
||||
|
||||
\section{Related Work}
|
||||
|
||||
|
||||
\paragraph{Parameter-Efficient Fine-Tuning (PEFT).}
|
||||
PEFT methods adapt large language models by updating only a small subset of parameters while keeping the backbone weights frozen.
|
||||
Early PEFT methods achieve parameter efficiency by introducing lightweight task-specific adaptation modules, including adapters, continuous prompts, and low-rank reparameterizations such as LoRA~\cite{pfeiffer2020adapterhub,li2021prefix,hu2021lora}.
|
||||
Although effective, these approaches rely on auxiliary components, leading to architectural modifications and additional complexity, especially in multi-task settings.
|
||||
More recently, sparse fine-tuning has emerged as an alternative PEFT paradigm that directly learns sparse updates in the original weight space~\cite{sanh2020movement,ansell2024scaling,shiracite}.
|
||||
Sparse fine-tuning directly updates a small subset of backbone parameters, avoiding auxiliary modules and additional inference overhead.
|
||||
Nevertheless, prior work primarily focuses on single-task settings and lacks explicit mechanisms for budget-aware sparse allocation in multi-task adaptation.
|
||||
|
||||
|
||||
\paragraph{Multi-Task Adaptation for LLMs.}
|
||||
|
||||
Multi-task adaptation enables a single model to support multiple tasks simultaneously.
|
||||
A common strategy extends PEFT methods to multi-task scenarios by introducing task-specific adaptation components, such as fusing adapters across tasks or jointly training multiple lightweight modules~\cite{pfeiffer2020adapterfusion,mao2022unipelt,sheng2023s}.
|
||||
Another line of work incorporates routing or mixture-of-experts mechanisms, where multiple adaptation modules (e.g., LoRA or adapter experts) are dynamically selected or weighted for different tasks or inputs~\cite{agiza2024mtlora,liu2024moe}.
|
||||
While effective at modeling task diversity, these methods rely on auxiliary modules or routing mechanisms and typically allocate adaptation capacity in a per-task or heuristic manner, without explicitly modeling global competition under a unified parameter budget.
|
||||
These limitations motivate a budget-aware multi-task adaptation approach that can jointly optimize shared and task-specific structures without introducing auxiliary modules.
|
||||
|
||||
|
||||
|
||||
|
||||
\section{Conclusion}
|
||||
We propose MESSA, a shared-specific sparse fine-tuning framework for budget-constrained multi-task adaptation of LLMs.
|
||||
MESSA formulates multi-task sparse fine-tuning as a structure allocation problem under a unified parameter budget and addresses it through a shared-specific decomposition coupled with a budget-aware soft-to-hard structure learning strategy.
|
||||
By jointly learning shared and task-specific sparse structures and hardening them into a fixed, deployable model, MESSA achieves strong multi-task performance without modifying the backbone architecture or introducing inference overhead.
|
||||
Extensive experiments across diverse tasks and backbones demonstrate that MESSA consistently outperforms existing PEFT methods under identical parameter budgets, achieving higher average performance, better task balance, and improved robustness.
|
||||
These results highlight the importance of budget-aware structural allocation for effective multi-task adaptation and suggest a promising direction for scalable and deployable sparse fine-tuning of LLMs.
|
||||
|
||||
\appendix
|
||||
\section{Evaluation Protocol}
|
||||
|
||||
In this work, we evaluate multi-task performance using task-specific primary metrics and report three aggregated metrics to reflect overall performance, balance, and robustness across tasks.
|
||||
|
||||
\subsection{Task-Specific Evaluation Metrics}
|
||||
|
||||
For each task, we adopt its standard evaluation metric following prior work:
|
||||
\begin{itemize}
|
||||
\item \textbf{BoolQ~\cite{clark2019boolq}, MedQA~\cite{jin2020disease}, HellaSwag~\cite{zellers2019hellaswag}}: Accuracy (Acc), defined as the proportion of correctly predicted answers.
|
||||
\item \textbf{GSM8K~\cite{cobbe2021gsm8k}}: Exact Match (EM), which measures the percentage of predictions that exactly match the ground-truth numerical answer after normalization.
|
||||
\item \textbf{CodeAlpaca~\cite{codealpaca}}: Instruction Compliance Rate (ICR), which measures the proportion of model outputs that successfully follow the instruction and produce a valid code response according to task-specific compliance rules.
|
||||
\end{itemize}
|
||||
|
||||
All metrics are computed independently for each task on its respective test set.
|
||||
|
||||
\subsection{Metric Normalization}
|
||||
|
||||
All primary metrics naturally lie in the range $[0,1]$, and therefore no additional rescaling or normalization is applied prior to aggregation.
|
||||
|
||||
\subsection{Aggregated Multi-Task Metrics}
|
||||
|
||||
Let $s_t \in [0,1]$ denote the primary evaluation score for task $t$, and let $T$ be the total number of tasks.
|
||||
|
||||
We report the following three aggregated metrics:
|
||||
\begin{itemize}
|
||||
\item \textbf{Macro Average}:
|
||||
\begin{equation}
|
||||
\text{MacroAvg} = \frac{1}{T} \sum_{t=1}^{T} s_t,
|
||||
\end{equation}
|
||||
which reflects the overall average performance across tasks.
|
||||
|
||||
\item \textbf{Geometric Mean}:
|
||||
\begin{equation}
|
||||
\text{GeoMean} = \exp\left( \frac{1}{T} \sum_{t=1}^{T} \log s_t \right),
|
||||
\end{equation}
|
||||
which emphasizes balanced performance and penalizes large disparities across tasks.
|
||||
|
||||
\item \textbf{Worst-Task Performance}:
|
||||
\begin{equation}
|
||||
\text{Worst} = \min_{t \in \{1,\dots,T\}} s_t,
|
||||
\end{equation}
|
||||
which measures robustness by capturing the weakest-task performance.
|
||||
\end{itemize}
|
||||
Note that aggregated evaluation metrics are used for reporting
|
||||
and are not involved in model selection or early stopping.
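The three aggregates can be computed directly from the per-task scores; a minimal sketch, assuming all scores are strictly positive:

\begin{verbatim}
import math

# scores: list of per-task primary scores s_t in (0, 1]
def aggregate(scores):
    macro = sum(scores) / len(scores)
    geo = math.exp(sum(math.log(s) for s in scores) / len(scores))
    worst = min(scores)
    return macro, geo, worst
\end{verbatim}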
|
||||
|
||||
\section{Experimental Setup}
|
||||
|
||||
\subsection{Tasks and Datasets}
|
||||
|
||||
We evaluate all methods on five diverse tasks:
|
||||
BoolQ (reading comprehension),
|
||||
CodeAlpaca (code generation),
|
||||
MedQA (medical question answering),
|
||||
GSM8K (mathematical reasoning),
|
||||
and HellaSwag (commonsense reasoning).
|
||||
A unified prompt format is used within each task, and the maximum sequence length is set to 2000 tokens for all experiments.
|
||||
|
||||
\subsection{Data Splits and Reproducibility}
|
||||
|
||||
For datasets with predefined validation sets (BoolQ and HellaSwag), we split the validation set evenly into development and test subsets.
|
||||
For datasets that only provide a test split (MedQA and GSM8K), we similarly split the test set into development and test subsets with a 1:1 ratio.
|
||||
For CodeAlpaca, which contains only a training split, we partition the data into train, development, and test sets using a 7:2:1 ratio.
|
||||
All dataset splits are created using a fixed random seed (42) to ensure reproducibility.
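A minimal sketch of the 1:1 development/test split under the fixed seed (hypothetical helper, not the released code):

\begin{verbatim}
import random

def split_half(examples, seed=42):
    idx = list(range(len(examples)))
    random.Random(seed).shuffle(idx)
    mid = len(idx) // 2
    dev = [examples[i] for i in idx[:mid]]
    test = [examples[i] for i in idx[mid:]]
    return dev, test
\end{verbatim}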
|
||||
|
||||
\subsection{Task Sampling}
|
||||
|
||||
We adopt an epoch-based mixed task sampling strategy.
|
||||
At each epoch, mini-batches from all tasks are constructed independently and then shuffled into a single global sequence.
|
||||
Each mini-batch contains samples from only one task, enabling task-specific gating,
|
||||
while the randomized batch order ensures balanced multi-task optimization.
|
||||
All training samples are visited exactly once per epoch.
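A minimal sketch of this sampling scheme (hypothetical names, illustrative only): per-task mini-batches are built independently and then shuffled into one global order for the epoch.

\begin{verbatim}
import random

# task_batches: dict mapping task id -> list of mini-batches for this epoch
def mixed_epoch(task_batches, seed=0):
    order = [(t, b) for t, bs in task_batches.items() for b in bs]
    random.Random(seed).shuffle(order)
    for t, batch in order:  # each batch contains samples from one task only
        yield t, batch
\end{verbatim}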
|
||||
|
||||
\subsection{Training Setup}
|
||||
|
||||
We implement all methods using PyTorch and DeepSpeed ZeRO-2 with CPU offloading to reduce GPU memory consumption.
|
||||
All experiments are conducted on NVIDIA GeForce RTX 4090 GPUs with BF16 mixed-precision training.
|
||||
We adopt the AdamW optimizer with $\beta_1=0.9$ and $\beta_2=0.95$, and employ a cosine learning rate schedule with a warmup ratio of 10\%.
|
||||
All models are trained using early stopping based on the validation loss.
|
||||
|
||||
\subsection{Implementation Details}
|
||||
All experiments are implemented in Python 3.12.3 using PyTorch 2.7.0 with CUDA 12.8.
|
||||
We use Hugging Face Transformers 4.51.0 for model loading, PEFT 0.17.0 for baseline implementations, and DeepSpeed 0.18.4 to accelerate training.
|
||||
Additional dependencies include Datasets 4.4.1, Accelerate 1.9.0, and NumPy 2.2.6.
|
||||
|
||||
\subsection{MESSA Configuration}
|
||||
|
||||
For sparse structure learning in MESSA, we set the learning rate to $1\times10^{-4}$ and use an effective batch size of 8, implemented as 2 samples per device with 4 steps of gradient accumulation.
|
||||
MESSA is applied to the attention projection layers (Q, K, V, and O) under a unified parameter budget of 2.5\% relative to the backbone model.
|
||||
The candidate pool factor is set to 1.5.
|
||||
The gate warmup phase occupies the first 5\% of training steps, followed by one-shot hard pruning at 15\% of the total training steps.
|
||||
We discourage simultaneous selection of shared and task-specific updates via overlap regularization, allowing at most 15\% overlap during soft structure learning.
|
||||
After pruning, unselected parameter groups are permanently frozen and soft gates are converted into hard binary masks.
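For reference, the hyperparameters above can be summarized as follows; the snippet is an illustrative summary rather than the exact configuration format of our code, and the module names are hypothetical.

\begin{verbatim}
messa_config = {
    "learning_rate": 1e-4,
    "per_device_batch_size": 2,
    "gradient_accumulation_steps": 4,  # effective batch size 8
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],  # Q/K/V/O
    "budget_ratio": 0.025,             # 2.5% of backbone parameters
    "candidate_pool_factor": 1.5,
    "gate_warmup_ratio": 0.05,         # first 5% of training steps
    "prune_at_ratio": 0.15,            # one-shot pruning at 15% of steps
    "max_overlap": 0.15,               # allowed shared/specific overlap
}
\end{verbatim}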
|
||||
|
||||
\subsection{Baseline Configurations}
|
||||
|
||||
All baselines are configured to ensure fair comparison under similar parameter budgets.
|
||||
For LoRA and SHiRA, we use rank $r=180$ in the shared setting and $r=36$ in the task-specific setting, as both methods adopt comparable low-rank parameterizations.
|
||||
For AdaLoRA, which adaptively adjusts rank during training, we set the initial rank to $r=100$ (shared) and $r=20$ (task-specific).
|
||||
For MTLoRA, we use $r_{\text{shared}}=r_{\text{task}}=36$ with the \texttt{matrixv2} fusion mode.
|
||||
For MOELoRA, we configure five experts corresponding to the number of tasks, each with rank $r=180$ and task embedding dimension 64.
|
||||
1871
mypaper/KDD2026_AgentCity.bib
Normal file
1871
mypaper/KDD2026_AgentCity.bib
Normal file
File diff suppressed because it is too large
825
mypaper/KDD2026_AgentCity.tex
Normal file
825
mypaper/KDD2026_AgentCity.tex
Normal file
@@ -0,0 +1,825 @@
|
||||
\title[AgentCity: An AI-Maintained Continuous Benchmark for Traffic Prediction]{AgentCity: An AI-Maintained Continuous Benchmark \\ for Traffic Prediction}
|
||||
|
||||
\input{misc}
|
||||
|
||||
|
||||
\section{Introduction}
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=1.\linewidth]{assets/Agent_Promo_NG.png}
|
||||
\caption{AgentCity: A multi-agent system for continuous traffic prediction benchmarking.}
|
||||
\label{fig:placeholder}
|
||||
\end{figure}
|
||||
|
||||
Traffic prediction is a fundamental component of data-driven intelligent transportation systems, supporting a wide range of applications such as traffic management, route planning, mobility analysis, and urban decision-making.
|
||||
In recent years, advances in deep learning have led to a rapid growth of traffic prediction models, covering diverse tasks including traffic state prediction, trajectory forecasting, travel time estimation, and map matching. These models vary substantially in architectural design, modeling assumptions, data requirements, and evaluation settings.
|
||||
|
||||
Consequently, benchmarks play a critical role in enabling systematic evaluation. By providing standardized datasets, clearly defined tasks, and consistent evaluation protocols, benchmarks allow fair and reproducible comparison of model performance across studies, and support empirical analysis within the research community. To this end, several benchmarking frameworks have been proposed for traffic and spatiotemporal prediction. Representative examples include DL-Traff~\cite{Dl-traff}, LibCity~\cite{Libcity}, and TorchSpatial~\cite{Torchspatial}, which aim to standardize data preprocessing, task definitions, and evaluation pipelines across a range of prediction tasks. These efforts establish a more consistent basis for empirical comparison.
|
||||
|
||||
However, existing benchmarks share a fundamental limitation: they rely on \textbf{manual, human-centered maintenance}, which introduces several structural challenges.
|
||||
|
||||
First, \textbf{\emph{limited scalability}} constrains benchmark coverage. The traffic prediction literature continues to expand rapidly, with a large number of new models published each year. These models are implemented using diverse frameworks, code structures, and data interfaces, making their manual integration into a unified benchmark labor-intensive and difficult to sustain at scale. As a result, benchmark coverage often lags behind recent research progress.
|
||||
|
||||
Second, \textbf{\emph{static evaluation pipelines}} limit continuous assessment. Most existing benchmarks are built upon fixed datasets and evaluation procedures, whereas real-world transportation systems evolve continuously, with changes in road networks, travel demand, and mobility patterns. Although some datasets are periodically updated, incorporating these updates into existing benchmarks typically requires additional manual effort, limiting long-term and continuous evaluation.
|
||||
|
||||
Third, \textbf{\emph{inconsistent evaluation settings}} weaken result comparability. Results reported in original papers are often obtained under carefully tuned configurations tailored to specific datasets and tasks, while benchmark implementations typically rely on default or minimally tuned settings. This difference can lead to deviations from reported results and reduces the benchmark’s reliability as a fair reference for model assessment.
|
||||
|
||||
|
||||
Together, these challenges indicate that a key limitation of traffic prediction benchmarking is no longer the absence of standardized frameworks, but the lack of a \emph{continuous}, \emph{scalable}, and \emph{consistently evaluated} maintenance mechanism that treats benchmark construction as an ongoing process rather than a one-time effort.
|
||||
|
||||
In this work, we propose \textbf{AgentCity}, an \textbf{AI-maintained} framework for the continuous construction and evaluation of traffic prediction benchmarks.
|
||||
AgentCity replaces manual, human-centered benchmark maintenance with an automated pipeline that systematically retrieves recent literature, integrates external model and dataset implementations, and evaluates models under unified and consistent protocols.
|
||||
|
||||
|
||||
AgentCity structures benchmark maintenance as a coordinated workflow consisting of three core components: \emph{literature retrieval}, \emph{model and data integration}, and \emph{standardized evaluation}.
|
||||
These components respectively support the automated discovery of relevant studies, the reproduction and integration of external models and datasets into a unified evaluation framework, and the fair assessment of models under consistent data processing, training, and evaluation settings.
|
||||
Within this process, controlled hyperparameter tuning is applied to each model on each task under a unified protocol, ensuring fair and comparable evaluation.
|
||||
The overall workflow is coordinated by a multi-agent system, enabling scalable and robust benchmark maintenance over time.
|
||||
|
||||
|
||||
|
||||
Built upon AgentCity, we construct a continuously evolving traffic prediction benchmark that currently aggregates 74 representative models across multiple tasks and datasets.
|
||||
All models are evaluated using unified evaluation protocols, enabling reproducible and comparable assessment across methods.
|
||||
The AgentCity framework and benchmark are publicly available online, with configurations and evaluation results for reproducibility.
|
||||
|
||||
|
||||
Our main contributions are summarized as follows:
|
||||
\begin{itemize}[leftmargin=*, topsep=0pt]
|
||||
\item We propose \textbf{AgentCity}, the first \textbf{AI-maintained} framework designed for continuous construction and evaluation of \textbf{traffic prediction benchmarks}.
|
||||
\item We develop a multi-agent workflow that automates key benchmark maintenance processes, including literature retrieval, model and data integration, and standardized evaluation.
|
||||
\item We release a large-scale, continuously updated traffic prediction benchmark and public leaderboard built upon AgentCity, supporting reproducible evaluation across tasks and datasets.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\begin{table*}[ht]
|
||||
\centering
|
||||
\caption{Categorization of traffic-related data and their typical representations.}
|
||||
\label{tab:st_data_abstraction}
|
||||
\resizebox{0.9\linewidth}{!}{
|
||||
\begin{tabular}{c c c c}
|
||||
\toprule
|
||||
\textbf{Data Group} &
|
||||
\textbf{Data Category} &
|
||||
\textbf{Description} &
|
||||
\textbf{Typical Data Form} \\
|
||||
\midrule
|
||||
\multirow{2}{*}{Static Spatial Structure}
|
||||
& Geographical Units &
|
||||
Geographical entities defining the spatial domain. &
|
||||
$N \times D$ \\
|
||||
& Unit Relations &
|
||||
Structured relations between spatial units. &
|
||||
$N \times N$ \\
|
||||
\midrule
|
||||
\multirow{3}{*}{Group-level Spatiotemporal Dynamics}
|
||||
& Unit-level Dynamics &
|
||||
Time-varying attributes defined on spatial units. &
|
||||
$T \times N \times D$ \\
|
||||
& Grid-level Dynamics &
|
||||
Time-varying attributes defined on spatial regions. &
|
||||
$T \times I \times J \times D$ \\
|
||||
& Origin--Destination Dynamics &
|
||||
Time-varying interactions between spatial unit pairs. &
|
||||
$T \times N \times N \times D$ \\
|
||||
\midrule
|
||||
Individual Trajectory Dynamics
|
||||
& Trajectory Data &
|
||||
Ordered temporal sequences of spatial states. &
|
||||
$\{(x_i, t_i)\}_{i=1}^{L}$ \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{table*}
|
||||
|
||||
|
||||
\begin{table*}[ht]
|
||||
\centering
|
||||
\caption{Categorization of traffic prediction tasks and their input--output data categories.}
|
||||
\label{tab:task_summary}
|
||||
\resizebox{\linewidth}{!}{
|
||||
\begin{tabular}{c c c c}
|
||||
\toprule
|
||||
\textbf{Task} &
|
||||
\textbf{Input Data Category} &
|
||||
\textbf{Output Data Category} &
|
||||
\textbf{Typical Data Form} \\
|
||||
\midrule
|
||||
Traffic State Prediction &
|
||||
Group-level Dynamics \,+\, Unit Relations &
|
||||
Future Unit-level Dynamics &
|
||||
$X \in \mathbb{R}^{T_{\text{in}} \times N \times D},\;
|
||||
y \in \mathbb{R}^{T_{\text{out}} \times N \times D}$ \\
|
||||
\midrule
|
||||
Trajectory Location Prediction &
|
||||
Trajectory Data \,+\, Geographical Units &
|
||||
Next Trajectory Location &
|
||||
$[loc_1, \ldots, loc_n] \rightarrow loc_{n+1}$ \\
|
||||
\midrule
|
||||
ETA Prediction &
|
||||
Trajectory Data &
|
||||
Travel Time &
|
||||
$\{(x_i, t_i)\}_{i=1}^{L} \rightarrow \Delta t$ \\
|
||||
\midrule
|
||||
Map Matching &
|
||||
Trajectory Data \,+\, Geographical Units \,+\, Unit Relations &
|
||||
Road segment sequence &
|
||||
$\{(lon_i, lat_i, t_i)\}_{i=1}^{L} \rightarrow \{r_j\}_{j=1}^{K}$ \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{table*}
|
||||
|
||||
|
||||
\section{Traffic Prediction Data and Tasks}
|
||||
\label{sec:background}
|
||||
|
||||
This section introduces a unified abstraction of data types and prediction tasks commonly studied in traffic prediction, highlighting the diversity of data organizations and task interfaces that characterize existing traffic prediction benchmarks.
|
||||
|
||||
|
||||
\subsection{Traffic-Related Data Categories}
|
||||
Traffic prediction data differ from homogeneous modalities such as images or text by combining spatial entities, relational structures, and time-indexed observations.
|
||||
In traffic scenarios, these data can be broadly categorized into three groups: static spatial structure, group-level traffic dynamics, and individual trajectory dynamics.
|
||||
|
||||
|
||||
\paratitle{Static Spatial Structure.}
|
||||
Static spatial structure describes the fixed spatial context of a traffic system.
|
||||
It includes geographical units that define the spatial domain, such as sensors, road segments, or regions, as well as structured relations between these units, such as network connectivity or adjacency.
|
||||
This category provides the spatial foundation upon which traffic observations are organized.
|
||||
|
||||
\paratitle{Group-level Traffic Dynamics.}
|
||||
Group-level traffic dynamics capture time-varying attributes defined over spatial units or their relations, including traffic speed, flow, or density measured at sensors or regions.
|
||||
Such data are usually represented as time-indexed tensors defined on nodes, grids, or origin--destination pairs.
|
||||
|
||||
|
||||
\paratitle{Individual Trajectory Dynamics.}
|
||||
Individual trajectory dynamics describe fine-grained mobility behavior of individual trips, represented as spatiotemporal state sequences.
|
||||
|
||||
Table~\ref{tab:st_data_abstraction} summarizes these data categories and their typical representations.
|
||||
Throughout this paper, $N$ denotes the number of spatial units, $T$ the number of time steps, $D$ the feature dimension, $I$ and $J$ the numbers of grid rows and columns, and $L$ the trajectory length.
|
||||
\subsection{Traffic Prediction Tasks}
|
||||
Based on the data categories above, we consider four representative traffic prediction tasks with different data categories and input--output structures, as summarized in Table~\ref{tab:task_summary}.
|
||||
|
||||
|
||||
\paratitle{Traffic state prediction}
|
||||
forecasts future traffic dynamics over a fixed set of spatial units.
|
||||
The input consists of historical group-level dynamics,
|
||||
$X \in \mathbb{R}^{T_{\text{in}} \times N \times D}$,
|
||||
and the output is a sequence of future unit-level dynamics,
|
||||
$y \in \mathbb{R}^{T_{\text{out}} \times N \times D}$.
|
||||
|
||||
\paratitle{Trajectory location prediction}
|
||||
focuses on next-step prediction for individual trajectories.
|
||||
Given a historical trajectory represented as an ordered sequence of locations $[loc_1, \ldots, loc_n]$, the task predicts the next location $loc_{n+1}$.
|
||||
The input trajectories are variable in length, and the outputs are discrete spatial states.
|
||||
|
||||
|
||||
\paratitle{Estimated time of arrival (ETA) prediction} aims to estimate the travel duration of a trajectory.
|
||||
The input is an individual trajectory represented as a sequence of spatiotemporal points
|
||||
$\{(x_i, t_i)\}_{i=1}^{L}$,
|
||||
and the output is a scalar value representing the estimated travel time.
|
||||
|
||||
\paratitle{Map matching}
|
||||
aims to infer the most likely network-constrained path that corresponds to an observed trajectory.
|
||||
Given noisy or sparse trajectory observations, the task outputs an ordered sequence of road segments that is consistent with the underlying network topology.
|
||||
|
||||
\section{Methodology}
|
||||
\subsection{Overview}
|
||||
\label{sec:overview}
|
||||
|
||||
AgentCity is a multi-agent framework designed to support the continuous construction and evaluation of traffic prediction benchmarks.
|
||||
Built on top of LibCity~\cite{Libcity}, AgentCity enables the automated discovery, reproduction, and evaluation of traffic prediction models under unified task definitions and evaluation protocols.
|
||||
Given user-specified keywords and constraints, the system incrementally identifies relevant studies, integrates their models and associated datasets, and evaluates them in a consistent manner.
|
||||
|
||||
As illustrated in Figure~\ref{fig:overview}, AgentCity organizes the overall process into three sequential stages: \emph{Literature Retrieval}, \emph{Model and Data Integration}, and \emph{Standardized Evaluation}.
|
||||
Each stage is managed by a dedicated \emph{Stage Leader Agent}, which is responsible for planning the stage workflow, coordinating specialized \emph{Subagents}, and validating intermediate results.
|
||||
Literature Retrieval focuses on identifying relevant models within a controlled search scope.
|
||||
Model and Data Integration handles the reproduction and adaptation of external model implementations and datasets into unified task interfaces.
|
||||
Standardized Evaluation assesses all integrated models under consistent data processing, training, and evaluation settings.
|
||||
|
||||
To accommodate heterogeneous implementations and incomplete specifications commonly found in research code, AgentCity supports iterative refinement within each stage.
|
||||
When intermediate results do not satisfy predefined validation criteria, the corresponding Stage Leader Agent selectively re-invokes relevant Subagents to refine the outcome, with explicit limits on the number of iterations.
|
||||
|
||||
Artifacts produced at each stage, including structured metadata, configuration files, and validation summaries, are recorded and propagated across stages by a Global Coordinator.
|
||||
This allows subsequent stages to operate based on established information while maintaining a clear separation of responsibilities.
|
||||
Together, these components form a structured workflow that enables scalable and reproducible benchmark construction for traffic prediction.
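A minimal sketch of the stage-level control flow, with hypothetical names and illustrative only (the concrete agents are described in the following subsections): each Stage Leader runs its Subagents in sequence, validates the stage output, and retries up to a fixed iteration limit.

\begin{verbatim}
# subagents: ordered list of callables (e.g., search, evaluate, analyze)
# validate: predicate implementing the stage's validation criteria
def run_stage(subagents, validate, artifacts, max_iters=3):
    for _ in range(max_iters):
        for agent in subagents:
            artifacts = agent(artifacts)
        if validate(artifacts):
            return artifacts
    return artifacts  # best effort after reaching the iteration limit
\end{verbatim}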
|
||||
|
||||
\begin{figure*}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{agentv2.pdf}
|
||||
\caption{AgentCity framework overview.
|
||||
Benchmark construction is organized into three stages: Literature Retrieval, Model and Data Integration, and Standardized Evaluation.
|
||||
Each stage is coordinated by a Leader Agent that invokes specialized Subagents to perform stage-specific operations.
|
||||
}
|
||||
|
||||
\label{fig:overview}
|
||||
\end{figure*}
|
||||
|
||||
|
||||
|
||||
\subsection{Stage I: Literature Retrieval}
|
||||
\label{sec:literature}
|
||||
The Literature Retrieval stage collects research work related to a given traffic prediction task and produces a structured set of candidate models for downstream integration and evaluation.
|
||||
This stage defines a documented search and filtering procedure and records the resulting candidates and associated metadata.
|
||||
It is managed by a \emph{Retrieval Leader Agent}, which coordinates multiple Subagents to perform concrete operations.
|
||||
|
||||
\paratitle{Paper Searcher.}
|
||||
The Paper Searcher retrieves candidate papers using keyword-based queries derived from user input or a predefined set of task-specific keywords.
|
||||
Additional constraints, such as publication venues or time ranges, can be specified to delimit the search scope.
|
||||
This step collects studies related to the target traffic prediction task across different modeling approaches.
|
||||
|
||||
\paratitle{Paper Evaluator.}
|
||||
The Paper Evaluator examines each retrieved paper to determine whether it provides the information required for subsequent model and data integration.
|
||||
The evaluation checks whether the paper specifies the prediction task, model formulation, input--output definitions, experimental setup, and evaluation metrics.
|
||||
Papers that lack information required for model implementation, data preparation, or evaluation are excluded at this stage.
|
||||
|
||||
\paratitle{Paper Analyzer.}
|
||||
For papers retained after evaluation, the Paper Analyzer extracts information needed for later stages.
|
||||
This includes references to model architectures, code repositories, descriptions of datasets and preprocessing steps, training and evaluation settings, and reported metrics.
|
||||
The extracted information is organized into a structured representation for use in model and data integration.
|
||||
|
||||
\paratitle{Stage execution.}
|
||||
The Retrieval Leader Agent executes the search, evaluation, and analysis steps in sequence.
|
||||
When the resulting paper set does not satisfy predefined criteria, such as coverage of the target task or completeness of extracted metadata, the leader reviews the execution outcomes and re-executes the relevant steps.
|
||||
The output of this stage is a structured collection of candidate models and associated metadata, which is passed to the subsequent integration stage.
|
||||
|
||||
|
||||
|
||||
|
||||
\subsection{Stage II: Model and Data Integration}
|
||||
\label{sec:migration}
|
||||
|
||||
The Model and Data Integration stage reproduces external traffic prediction models together with their associated datasets and aligns them with unified task interfaces for evaluation.
|
||||
This stage transforms heterogeneous research implementations into executable benchmark components that follow consistent data organization, training procedures, and evaluation protocols.
|
||||
It is coordinated by an \emph{Integration Leader Agent}, which manages a set of Subagents responsible for concrete integration steps.
|
||||
|
||||
\paratitle{Source Collector.}
|
||||
The Source Collector retrieves the resources required for reproduction, including model implementations, configuration files, and dataset references extracted in Stage~I.
|
||||
It analyzes the structure of the retrieved codebase to identify model definitions, training pipelines, data loading logic, and external dependencies.
|
||||
The collected sources serve as the basis for subsequent integration.
|
||||
|
||||
\paratitle{Model and Data Adapter.}
|
||||
The Model and Data Adapter performs the core integration work.
|
||||
For models, it aligns architecture definitions, input--output formats, and training interfaces with the benchmark’s task specifications.
|
||||
For datasets, it handles dataset acquisition, preprocessing alignment, feature construction, and data split configuration according to the benchmark protocol.
|
||||
|
||||
\paratitle{Configuration Assembler.}
|
||||
The Configuration Assembler constructs unified configuration files that combine model settings, dataset parameters, and training options.
|
||||
Reported hyperparameters and experimental settings from the original paper are incorporated when available.
|
||||
When details are unspecified, task-consistent defaults defined by the benchmark are applied.
|
||||
The resulting configurations define a complete and executable evaluation setup.
|
||||
|
||||
\paratitle{Integration Validator.}
|
||||
The Integration Validator executes a validation run using the assembled model and dataset configuration.
|
||||
It verifies model initialization, data loading, and basic training execution, and records logs to assess integration completeness.
|
||||
|
||||
|
||||
|
||||
\paratitle{Stage execution.}
|
||||
The Integration Leader Agent executes source collection, adaptation, configuration assembly, and validation in sequence, and re-invokes relevant Subagents when validation criteria are not satisfied.
|
||||
The output of this stage is an executable model--dataset pair together with structured configurations and validation records, which are passed to the evaluation stage.
|
||||
|
||||
|
||||
|
||||
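As a concrete illustration, the sketch below shows how an assembled configuration and a lightweight validation run could be combined in the Stage~II loop; the dictionary keys and helper names are assumptions introduced for exposition rather than the benchmark's actual configuration schema.

\begin{verbatim}
# Illustrative Stage II sketch (field and helper names are assumptions).
def assemble_config(paper_meta, benchmark_defaults):
    """Merge reported hyperparameters with task-consistent defaults."""
    config = dict(benchmark_defaults)                     # benchmark defaults first
    config.update(paper_meta.get("hyperparameters", {}))  # prefer reported settings
    config["model"] = paper_meta["model"]
    config["dataset"] = paper_meta["dataset"]
    return config

def integrate(paper_meta, benchmark_defaults, adapt, validate, max_attempts=3):
    """adapt/validate stand in for the Adapter and Integration Validator."""
    for _ in range(max_attempts):
        component = adapt(paper_meta)            # align model and data interfaces
        config = assemble_config(paper_meta, benchmark_defaults)
        if validate(component, config):          # short validation run with logs
            return component, config             # executable model-dataset pair
    raise RuntimeError("integration did not pass validation")
\end{verbatim}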
\begin{figure*}
\centering
\includegraphics[width=1\linewidth]{pie_combined.png}
\caption{Distributions of studies included in the benchmark.
The figure shows the distribution of collected papers by publication venue (left), publication year (middle), and traffic prediction task (right).}
\label{fig:analysis}
\end{figure*}

\subsection{Stage III: Standardized Evaluation}
\label{sec:evaluation}

The Standardized Evaluation stage runs the integrated traffic prediction models under unified training and evaluation protocols to produce comparable performance results across models.
It is coordinated by an \emph{Evaluation Leader Agent}, which oversees a small set of Subagents responsible for execution and result aggregation.

\paratitle{Evaluation Planner.}
The Evaluation Planner specifies the evaluation configuration for each model--task pair, including training settings, evaluation metrics, and the hyperparameter ranges defined by the benchmark protocol.

\paratitle{Evaluation Executor.}
The Evaluation Executor runs model training and evaluation using the specified configurations.
During execution, it records performance metrics, training dynamics, and runtime information required for result reporting and analysis.

\paratitle{Result Collector.}
The Result Collector aggregates evaluation outputs across runs, identifies the best-performing configurations according to task-specific metrics, and organizes the results into standardized records for benchmarking.

\paratitle{Stage execution.}
The Evaluation Leader Agent coordinates planning, execution, and result collection, and re-invokes the relevant steps when evaluation results are invalid or incomplete.
The output of this stage is a set of standardized evaluation results that can be directly compared across models.
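For illustration, the following sketch gives one possible shape of the Stage~III loop; the helper functions and the selection metric are assumptions made for exposition.

\begin{verbatim}
# Illustrative Stage III sketch (helper names are assumptions).
def evaluate_models(pairs, plan, execute, metric="MAE"):
    """pairs: executable model-dataset pairs from Stage II;
    plan/execute stand in for the Evaluation Planner and Executor."""
    leaderboard = {}
    for model, dataset in pairs:
        runs = []
        for config in plan(model, dataset):        # hyperparameter ranges from protocol
            metrics = execute(model, dataset, config)   # training + evaluation run
            runs.append({"config": config, "metrics": metrics})
        best = min(runs, key=lambda r: r["metrics"][metric])  # Result Collector
        leaderboard[(model, dataset)] = best       # standardized, comparable record
    return leaderboard
\end{verbatim}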
\subsection{Implementation Details}
\label{sec:implementation}

AgentCity is implemented as a coordinated multi-agent system centered around a \emph{Global Coordinator}.
The coordinator maintains a shared execution context and dispatches stage-specific \emph{Leader Agents} to execute the three benchmark stages in sequence.
Each Leader Agent manages its workflow by invoking Subagents, validating intermediate outputs, and controlling stage execution.

\paratitle{Agent Coordination and Control.}
Leader Agents follow a unified control pattern, decomposing each stage into executable steps, invoking Subagents for concrete operations, and collecting structured outputs.
Subagents encapsulate task-specific functions such as literature querying, source acquisition, code adaptation, dataset preparation, model execution, and result aggregation.

\paratitle{Cross-Stage Context Propagation.}
The Global Coordinator maintains a shared execution context that records structured artifacts produced at each stage.
These artifacts are propagated across stages to support subsequent execution without repeating earlier steps.

\paratitle{Model Backend Configuration.}
Different language model backends can be assigned to agents according to task requirements.
Code-related and diagnostic tasks use more capable backends, while routine operations may use lighter-weight ones.
Backend selection is specified through system configuration and is independent of the overall workflow structure.
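To make backend assignment concrete, the sketch below maps Subagent roles to language model backends through a single configuration table; the backend identifiers and role names are placeholders rather than the system's actual configuration.

\begin{verbatim}
# Illustrative backend configuration (identifiers are placeholders).
BACKENDS = {
    "code_adaptation":   "strong-llm-backend",  # code-related, diagnostic tasks
    "integration_debug": "strong-llm-backend",
    "literature_search": "light-llm-backend",   # routine operations
    "result_collection": "light-llm-backend",
}

def backend_for(role, default="light-llm-backend"):
    """Resolve the backend for a Subagent role, independent of the workflow."""
    return BACKENDS.get(role, default)

print(backend_for("code_adaptation"))  # -> strong-llm-backend
\end{verbatim}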
\begin{table}[t]
\centering
\caption{Traffic prediction datasets in AgentCity.}
\label{tab:dataset_stats}
\resizebox{\linewidth}{!}{
\begin{tabular}{l p{7cm}}
\toprule
\textbf{Task} & \textbf{Dataset} \\
\midrule
Traffic State Prediction & METR-LA\cite{METR_LA/PEMS_BAY}, PEMSD7(M)\cite{PEMSD7M}, PEMS-BAY\cite{METR_LA/PEMS_BAY}, PEMSD3\cite{PEMSD3/7}, PEMSD4\cite{PEMSD4/8}, PEMSD7\cite{PEMSD3/7}, PEMSD8\cite{PEMSD4/8}, TAXIBJ\cite{TaxiBJ}, T-DRIVE\cite{T-drive}, NYCTaxi\cite{NYCTaxi/Bike}, NYCBike\cite{NYCTaxi/Bike}, LargeST\cite{LargeST} \\
Traj. Loc. Prediction & Gowalla\cite{Gowalla/BrightKite}, Foursquare-TKY\cite{Foursquare-NYC/TKY}, Foursquare-NYC\cite{Foursquare-NYC/TKY}, BrightKite\cite{Gowalla/BrightKite}, Instagram\cite{Instagram}, Singapore\cite{Singapore}, Porto\cite{Porto} \\
ETA Prediction & Chengdu\cite{Chengdu/DeepTTE}, Beijing\cite{Beijing/TTPNet}, Porto\cite{Porto}, NYCTaxi\cite{NYCTaxi/Bike}, NYCBike\cite{NYCTaxi/Bike} \\
Map Matching & Global\cite{Global} (Neftekamsk, Ruzhany, Spaichingen, Valky), Seattle\cite{Seattle} \\
\bottomrule
\end{tabular}}
\end{table}

\begin{table}[t]
\centering
\caption{Traffic prediction models in AgentCity.}
\label{tab:model_stats}
\resizebox{\linewidth}{!}{
\begin{tabular}{l p{9cm}}
\toprule
\textbf{Task} & \textbf{Model} \\
\midrule
Traffic State Prediction &
STSSDL\cite{STSSDL}, STAEformer\cite{STAEformer}, AutoSTF\cite{AutoSTF}, STDMAE\cite{STDMAE},
EAC\cite{EAC}, GriddedTNP\cite{GriddedTNP}, PatchSTG\cite{PatchSTG}, SRSNet\cite{SRSNet},
FlashST\cite{FlashST}, ConvTimeNet\cite{Convtimenet}, Fredformer\cite{Fredformer}, Pathformer\cite{Pathformer},
HTVGNN\cite{HTVGNN}, PatchTST\cite{PatchTST}, DCST\cite{DCST}, STLLM\cite{STLLM},
T-graphormer\cite{T-graphormer}, CKGGNN\cite{CKGGNN}, EasyST\cite{EasyST}, LEAF\cite{LEAF},
MetaDG\cite{MetaDG}, TRACK\cite{TRACK}, HiMSNet\cite{HiMSNet}, DST2former\cite{DST2former},
DSTMamba\cite{DSTMamba}, BigST\cite{BigST}, ASeer\cite{ASeer}, STHSepNet\cite{STHSepNet},
STWave\cite{STWave}, HSTWAVE\cite{HSTWAVE}, DSTAGNN\cite{DSTAGNN}, RSTIB\cite{RSTIB},
LSTTN\cite{LSTTN}, LightST\cite{LightST}, TimeMixer++\cite{TimeMixer++}, STID\cite{STID}, UniST\cite{UniST} \\
Traj. Loc. Prediction &
DeepMove\cite{DeepMove}, PLMTrajRec\cite{PLMTrajRec}, START\cite{START}, LoTNext\cite{LoTNext},
RNTrajRec\cite{RNTrajRec}, CoMaPOI\cite{CoMaPOI}, JGRM\cite{JGRM}, TrajSDE\cite{TrajSDE},
DCHL\cite{DCHL}, GNPRSID\cite{GNPRSID}, PLSPL\cite{PLSPL}, GETNext\cite{GETNEXT},
CANOE\cite{CANOE}, TPG\cite{TPG}, CLSPRec\cite{CLSPRec}, AGRAN\cite{AGRAN},
LightPath\cite{LightPath}, ROTAN\cite{ROTAN}, FPMC\cite{FPMC}, PRME\cite{PRME} \\
ETA Prediction &
DOT\cite{DOT}, MetaTTE\cite{MetaTTE}, MVSTM\cite{MVSTM}, DutyTTE\cite{DutyTTE},
TTPNet\cite{TTPNet}, MTSTAN\cite{MTSTAN}, MulT-TTE\cite{MulT-TTE}, MDTI\cite{MDTI},
ProbETA\cite{ProbETA}, HierETA\cite{HierETA}, HetETA\cite{HetETA} \\
Map Matching &
DeepMM\cite{DeepMM}, GraphMM\cite{GraphMM}, DiffMM\cite{DiffMM}, TRMMA\cite{TRMMA},
L2MM\cite{L2MM}, RLOMM\cite{RLOMM}, FMM\cite{FMM}, HMMM\cite{HMMM}, STMatching\cite{STMatching} \\
\bottomrule
\end{tabular}}
\end{table}

\begin{table*}[t]
\centering
\caption{Task-wise datasets, data scale statistics, and evaluation metrics used in the benchmark.
$N$, $E$, and $U$ denote the numbers of nodes, edges, and users, respectively.
$T$ denotes the total volume of data records, corresponding to the accumulated traffic flow observations for Traffic State Prediction and the total number of trajectory points or check-ins for the other tasks.}
\label{tab:task_dataset_overview}
\resizebox{\linewidth}{!}{
\begin{tabular}{l l l l l}
\toprule
\textbf{Task} & \textbf{Dataset} & \textbf{Scale ($N/E/U/T$)}
& \textbf{Time Span} & \textbf{Metrics} \\
\midrule
\multirow{3}{*}{Traffic State Prediction}
& METR-LA
& $N{=}207$, $E{=}11{,}753$, $T{=}7.1$M
& Mar. 2012 -- Jun. 2012
& MAE$\downarrow$, RMSE$\downarrow$ \\
& PEMSD7
& $N{=}228$, $E{=}51{,}984$, $T{=}2.9$M
& May 2017 -- Aug. 2017
& MAE$\downarrow$, RMSE$\downarrow$ \\
& PEMS-BAY
& $N{=}325$, $E{=}8{,}358$, $T{=}16.9$M
& Jan. 2017 -- Jun. 2017
& MAE$\downarrow$, RMSE$\downarrow$ \\
\midrule
\multirow{3}{*}{Trajectory Location Prediction}
& Foursquare\_NYC
& $N{=}38{,}332$, $U{=}1{,}082$, $T{=}227$K
& Apr. 2012 -- Feb. 2013
& Acc@1$\uparrow$, Acc@5$\uparrow$ \\
& Foursquare\_TKY
& $N{=}61{,}857$, $U{=}2{,}292$, $T{=}574$K
& Apr. 2012 -- Feb. 2013
& Acc@1$\uparrow$, Acc@5$\uparrow$ \\
& Singapore
& $N{=}20{,}153$, $U{=}17{,}744$, $T{=}696$K
& Jan. 2017 -- Jun. 2017
& Acc@1$\uparrow$, Acc@5$\uparrow$ \\
\midrule
\multirow{2}{*}{ETA Prediction}
& Beijing
& $N{=}16{,}383$, $U{=}76$, $T{=}518$K
& Oct. 2013
& MAE$\downarrow$, MAPE$\downarrow$, RMSE$\downarrow$ \\
& Chengdu
& $N{=}440{,}056$, $U{=}4{,}565$, $T{=}712$K
& Aug. 2014
& MAE$\downarrow$, MAPE$\downarrow$, RMSE$\downarrow$ \\
\midrule
\multirow{4}{*}{Map Matching}
& Neftekamsk
& $N{=}18{,}195$, $E{=}41{,}971$, $T{=}2.5$K
& 2015
& RMF$\downarrow$, AL$\uparrow$ \\
& Santander
& $N{=}24{,}217$, $E{=}48{,}100$, $T{=}653$
& 2015
& RMF$\downarrow$, AL$\uparrow$ \\
& Spaichingen
& $N{=}4{,}575$, $E{=}9{,}992$, $T{=}517$
& 2015
& RMF$\downarrow$, AL$\uparrow$ \\
& Valky
& $N{=}1{,}578$, $E{=}3{,}142$, $T{=}1.0$K
& 2015
& RMF$\downarrow$, AL$\uparrow$ \\
\bottomrule
\end{tabular}}
\end{table*}

\begin{table}[t]
\centering
\caption{Traffic state prediction leaderboard on METR\_LA, PEMSD7, and PEMS\_BAY under unified evaluation protocols.}
\label{tab:traffic_leaderboard}
\resizebox{1\linewidth}{!}{
\begin{tabular}{l cc cc cc}
\toprule
\textbf{Model} &
\multicolumn{2}{c}{\textbf{METR\_LA}} &
\multicolumn{2}{c}{\textbf{PEMSD7}} &
\multicolumn{2}{c}{\textbf{PEMS\_BAY}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
& MAE$\downarrow$ & RMSE$\downarrow$
& MAE$\downarrow$ & RMSE$\downarrow$
& MAE$\downarrow$ & RMSE$\downarrow$ \\
\midrule
STAEformer\cite{STAEformer} & 2.962 & 5.984 & 18.96 & 32.28 & 1.532 & 3.446 \\
DCST\cite{DCST} & 3.090 & 6.334 & 19.39 & 32.72 & 1.561 & 3.483 \\
DST2former\cite{DST2former} & 3.095 & 6.240 & 19.67 & 32.61 & 1.639 & 3.587 \\
STDMAE\cite{STDMAE} & 3.096 & 6.230 & 20.19 & 32.99 & 1.579 & 3.502 \\
EasyST\cite{EasyST} & 3.115 & 6.419 & 19.49 & 32.48 & 1.565 & 3.509 \\
PatchSTG\cite{PatchSTG} & 3.127 & 6.316 & 19.99 & 32.90 & 1.589 & 3.580 \\
HiMSNet\cite{HiMSNet} & 3.143 & 6.221 & 23.34 & 36.04 & 1.670 & 3.613 \\
STLLM\cite{STLLM} & 3.151 & 6.284 & 20.92 & 33.65 & 1.616 & 3.592 \\
LightST\cite{LightST} & 3.167 & 6.372 & 22.00 & 34.59 & 1.607 & 3.580 \\
STWave\cite{STWave} & 3.186 & 6.417 & 23.02 & 37.04 & 1.619 & 3.621 \\
RSTIB\cite{RSTIB} & 3.194 & 6.606 & 20.37 & 33.40 & 1.610 & 3.666 \\
FlashST\cite{FlashST} & 3.203 & 6.511 & 22.40 & 35.47 & 1.636 & 3.645 \\
BigST\cite{BigST} & 3.218 & 6.359 & 21.11 & 34.18 & 1.622 & 3.538 \\
TRACK\cite{TRACK} & 3.278 & 6.710 & 25.82 & 39.31 & 1.749 & 4.007 \\
DSTAGNN\cite{DSTAGNN} & 3.331 & 6.599 & 22.73 & 36.04 & 1.745 & 3.800 \\
GriddedTNP\cite{GriddedTNP} & 3.412 & 6.989 & 29.83 & 53.10 & 2.379 & 5.099 \\
EAC\cite{EAC} & 3.532 & 6.915 & 26.61 & 40.23 & 1.834 & 4.045 \\
AutoSTF\cite{AutoSTF} & 3.977 & 9.406 & 19.72 & 32.56 & 1.544 & 3.446 \\
Fredformer\cite{Fredformer} & 4.159 & 9.014 & 24.16 & 38.54 & 1.866 & 4.214 \\
ConvTimeNet\cite{Convtimenet} & 4.250 & 9.249 & 29.18 & 45.33 & 2.014 & 4.650 \\
LEAF\cite{LEAF} & 4.407 & 9.989 & 28.49 & 43.17 & 1.886 & 4.101 \\
SRSNet\cite{SRSNet} & 4.882 & 10.348 & 32.12 & 48.80 & 2.163 & 4.923 \\
\bottomrule
\end{tabular}}
\end{table}

\begin{table}[t]
\centering
\caption{Trajectory location prediction leaderboard on Foursquare\_NYC, Foursquare\_TKY, and Singapore.}
\label{tab:traj_leaderboard}
\resizebox{1\linewidth}{!}{
\begin{tabular}{l cc cc cc}
\toprule
\textbf{Model} &
\multicolumn{2}{c}{\textbf{Foursquare\_NYC}} &
\multicolumn{2}{c}{\textbf{Foursquare\_TKY}} &
\multicolumn{2}{c}{\textbf{Singapore}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
& Acc@1$\uparrow$ & Acc@5$\uparrow$
& Acc@1$\uparrow$ & Acc@5$\uparrow$
& Acc@1$\uparrow$ & Acc@5$\uparrow$ \\
\midrule
ROTAN\cite{ROTAN} & 0.1302 & 0.2805 & 0.1897 & 0.3653 & 0.1631 & 0.3331 \\
GNPRSID\cite{GNPRSID} & 0.1591 & 0.3419 & 0.1658 & 0.3746 & 0.1539 & 0.3471 \\
RNTrajRec\cite{RNTrajRec} & 0.1605 & 0.3231 & 0.1539 & 0.3305 & 0.1378 & 0.2978 \\
DeepMove\cite{DeepMove} & 0.1572 & 0.3739 & 0.1800 & 0.3869 & 0.1298 & 0.3096 \\
PLSPL\cite{PLSPL} & 0.1034 & 0.3211 & 0.1732 & 0.3596 & 0.1527 & 0.3294 \\
CANOE\cite{CANOE} & 0.1147 & 0.2883 & 0.1535 & 0.3485 & 0.1366 & 0.3089 \\
LoTNext\cite{LoTNext} & 0.0856 & 0.2402 & 0.1322 & 0.3890 & 0.1365 & 0.3576 \\
DCHL\cite{DCHL} & 0.1009 & 0.3141 & 0.0706 & 0.2507 & 0.0889 & 0.2678 \\
\bottomrule
\end{tabular}}
\end{table}

\begin{table}[t]
\centering
\caption{ETA prediction leaderboard on Beijing and Chengdu.}
\label{tab:eta_leaderboard}
\resizebox{\linewidth}{!}{
\begin{tabular}{l ccc ccc}
\toprule
\multirow{2}{*}{\textbf{Model}} &
\multicolumn{3}{c}{\textbf{Beijing}} &
\multicolumn{3}{c}{\textbf{Chengdu}} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
& MAE$\downarrow$ & MAPE$\downarrow$ & RMSE$\downarrow$
& MAE$\downarrow$ & MAPE$\downarrow$ & RMSE$\downarrow$ \\
\midrule
HetETA\cite{HetETA} & 125.67 & 0.105 & 222.91 & 190.56 & 0.113 & 308.56 \\
DeepTTE\cite{Chengdu/DeepTTE} & 224.46 & 0.208 & 351.74 & 317.38 & 0.220 & 429.09 \\
MVSTM\cite{MVSTM} & 279.08 & 0.270 & 430.98 & 255.18 & 0.189 & 343.43 \\
MulT-TTE\cite{MulT-TTE} & 280.36 & 0.274 & 432.43 & 465.59 & 0.381 & 580.25 \\
DOT\cite{DOT} & 364.85 & 0.382 & 547.62 & 209.74 & 0.163 & 286.02 \\
MetaTTE\cite{MetaTTE} & 372.15 & 0.347 & 562.24 & 394.52 & 0.300 & 511.63 \\
DutyTTE\cite{DutyTTE} & 431.59 & 0.460 & 572.96 & 243.13 & 0.171 & 443.44 \\
\bottomrule
\end{tabular}}
\end{table}

\begin{table}[t]
\centering
\caption{Map matching leaderboard on Santander, Spaichingen, Neftekamsk, and Valky.}
\label{tab:mm_leaderboard}
\resizebox{0.9\linewidth}{!}{
\begin{tabular}{l cc cc cc cc}
\toprule
\multirow{2}{*}{\textbf{Model}} &
\multicolumn{2}{c}{\textbf{Santander}} &
\multicolumn{2}{c}{\textbf{Spaichingen}} &
\multicolumn{2}{c}{\textbf{Neftekamsk}} &
\multicolumn{2}{c}{\textbf{Valky}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
& RMF$\downarrow$ & AL$\uparrow$
& RMF$\downarrow$ & AL$\uparrow$
& RMF$\downarrow$ & AL$\uparrow$
& RMF$\downarrow$ & AL$\uparrow$ \\
\midrule
FMM\cite{FMM} & 0.018 & 1.000 & 0.000 & 1.000 & 0.852 & 0.193 & 0.329 & 0.671 \\
HMMM\cite{HMMM} & 0.021 & 0.997 & 0.035 & 1.000 & 0.391 & 0.999 & 0.433 & 1.000 \\
STMatching\cite{STMatching} & 0.674 & 0.998 & 0.088 & 1.000 & 0.457 & 1.000 & 0.436 & 1.000 \\
DeepMM\cite{DeepMM} & 0.981 & 0.019 & 0.947 & 0.053 & 0.889 & 0.111 & 0.909 & 0.091 \\
L2MM\cite{L2MM} & 1.132 & 0.057 & 1.632 & 0.158 & 0.778 & 0.222 & 2.455 & 0.182 \\
RLOMM\cite{RLOMM} & 0.920 & 0.280 & 2.760 & 0.240 & 7.440 & 0.120 & 3.000 & 0.600 \\
\bottomrule
\end{tabular}}
\end{table}

\section{The AgentCity Benchmark}
\label{sec:benchmark_release}

\subsection{Benchmark Scope and Coverage}
\label{sec:benchmark_scope}

AgentCity supports a unified benchmark that spans multiple traffic prediction tasks and datasets.
At the time of writing, the benchmark covers four representative traffic prediction tasks: traffic state prediction, trajectory location prediction, ETA prediction, and map matching.
Across these tasks, AgentCity aggregates a diverse collection of publicly available datasets and model implementations.

Table~\ref{tab:dataset_stats} summarizes the datasets included in the benchmark.
In total, AgentCity covers 26 publicly available datasets across the four traffic prediction tasks.
These datasets span heterogeneous spatial representations and temporal resolutions, including graph-based, grid-based, and origin--destination data for traffic state prediction, as well as trajectory datasets represented as variable-length sequences of locations or GPS points.
For ETA prediction and map matching, the benchmark includes GPS trajectory datasets of varying scale in terms of trajectory volume and road network size.

Table~\ref{tab:model_stats} summarizes the traffic prediction models currently included in AgentCity.
For each task, the benchmark integrates a representative set of models that follow heterogeneous modeling assumptions and architectural designs.
All models are reproduced and evaluated under unified task definitions and evaluation protocols, enabling consistent comparison within and across tasks.

Across tasks, the benchmark includes datasets defined on sensor networks, region-based spatial partitions, road network graphs, and individual trajectories.
Traffic state prediction datasets are typically defined on fixed sensor networks with regular temporal sampling, while trajectory-based datasets represent individual mobility as sequences of locations or GPS points.
Map matching datasets are constructed on explicit road networks and focus on network-constrained trajectory inference.
Together, these datasets capture both group-level and individual-level traffic dynamics under heterogeneous spatial settings.
\subsection{Literature Coverage Analysis}
\label{sec:literature_analysis}

To characterize the literature coverage of the benchmark, we analyze the distribution of studies included through AgentCity across publication venues, years, and traffic prediction tasks.
Figure~\ref{fig:analysis} summarizes these statistics based on the models that have been reproduced and integrated into the benchmark.

In total, the benchmark includes 74 research papers published in recent years.
These papers span multiple traffic prediction tasks, with 36 studies on traffic state prediction, 18 on trajectory location prediction, 11 on estimated time of arrival (ETA) prediction, and 9 on map matching.
This task distribution reflects the relative research activity across different traffic prediction problems.

The venue distribution indicates that many collected studies originate from major data mining and machine learning venues, with KDD representing the largest share.
In addition, a notable portion of the models is released through arXiv, reflecting research activity beyond traditional conference venues.

The year distribution indicates that most included studies were published between 2023 and 2025.
This concentration reflects the recent growth of research activity in traffic prediction and related areas.
These statistics provide a descriptive overview of the literature represented in the benchmark and clarify the scope of models evaluated in AgentCity.
\subsection{Task-wise Leaderboards}
\label{sec:leaderboards}

This subsection presents representative leaderboard results for four core traffic prediction tasks under unified evaluation protocols.
The reported results provide a task-wise view of model performance under consistent data processing, training, and evaluation settings.

Traffic state prediction results are reported on METR\_LA, PEMSD7, and PEMS\_BAY; trajectory location prediction on Foursquare (NYC, TKY) and Singapore; ETA prediction on Beijing and Chengdu; and map matching on selected cities from the Global dataset.
All models are evaluated within a unified framework, with hyperparameters systematically tuned via AgentCity.
Training is controlled using early stopping based on validation loss, and the checkpoint with the best validation performance is selected for evaluation.

Table~\ref{tab:task_dataset_overview} summarizes the datasets used in the reported benchmark results, together with their basic statistics and evaluation protocols.
Tables~\ref{tab:traffic_leaderboard}--\ref{tab:mm_leaderboard} present the corresponding task-wise leaderboard results under consistent evaluation settings.

For clarity and space considerations, we report results on a representative subset of widely used datasets and models for each task, following standard evaluation settings in prior studies.
The complete benchmark results, covering additional datasets and model implementations, are available through the online leaderboard.
\begin{figure}
\centering
\begin{subfigure}[b]{0.48\linewidth}
\hspace{-3px}
\includegraphics[width=\linewidth]{figures/Frontend.png}
\caption{Benchmark Homepage}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.48\linewidth}
\hspace{-3px}
\includegraphics[width=\linewidth]{figures/LeaderBoard.png}
\caption{AgentCity Interface}
\end{subfigure}
\caption{The AgentCity platform.
The benchmark homepage presents benchmark statistics and public leaderboards.
The AgentCity interface provides an interactive environment for the agent-driven workflow.}
\label{fig:AgentCity}
\end{figure}

\subsection{Benchmark Access and Usage}
\label{sec:benchmark_access}

The AgentCity benchmark is publicly accessible.
Figure~\ref{fig:AgentCity} presents the project homepage and the AgentCity user interface, which together provide benchmark information, evaluation results, and guidance for executing the benchmark workflow with AgentCity.

The project homepage introduces the overall scope of AgentCity, including the supported traffic prediction tasks, benchmark organization, and evaluation protocols.
It provides documentation for installing and running AgentCity and presents detailed task-wise leaderboards that report benchmark results under unified evaluation settings.
The AgentCity user interface allows users to interactively execute the benchmark construction workflow described in this paper.
Through the interface, users can run the three stages of literature retrieval, model and data integration, and standardized evaluation, and examine the corresponding outputs.
Execution logs, intermediate artifacts, and analysis results from each stage are displayed to support inspection of the benchmark process.

Detailed usage instructions, task-wise leaderboards, and documentation of the unified evaluation framework are available through the project website and source code repository.\footnote{\fulllink}
\begin{table}[t]
\centering
\caption{Comparison between reported results and reproduced results in terms of MAE and RMSE.}
\label{tab:mae_rmse_comparison}
\resizebox{\linewidth}{!}{%
\begin{tabular}{l l cc cc c}
\toprule
\multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Dataset}} &
\multicolumn{2}{c}{\textbf{Paper Reported}} &
\multicolumn{2}{c}{\textbf{Reproduced}} &
\multirow{2}{*}{\textbf{Gap (\%)}} \\
\cmidrule(lr){3-4} \cmidrule(lr){5-6}
& & MAE & RMSE & MAE & RMSE & \\
\midrule
DSTAGNN & PEMSD4 & 19.30 & 31.46 & 19.90 & 31.29 & 0.85 \\
LightST & PEMSD7 & 20.78 & 33.95 & 21.99 & 34.59 & 3.38 \\
RSTIB & PEMSD7 & 19.84 & 33.90 & 20.37 & 33.40 & 0.06 \\
STDMAE & METR\_LA & 3.00 & 5.98 & 3.09 & 6.23 & 3.79 \\
LSTTN & METR\_LA & 2.96 & 5.92 & 3.08 & 6.12 & 3.60 \\
AutoSTF & PEMS\_BAY & 1.55 & 3.51 & 1.54 & 3.44 & -1.58 \\
DCST & PEMS\_BAY & 1.55 & 3.50 & 1.56 & 3.48 & -0.20 \\
\bottomrule
\end{tabular}%
}
\end{table}

\begin{table*}[t]
\centering
\caption{Comparison of reproduction consistency between AgentCity and other code-oriented agents.}
\label{tab:selected_models}
\resizebox{0.8\linewidth}{!}{
\begin{tabular}{l ccc ccc ccc ccc}
\toprule
\multirow{2}{*}{\textbf{Source}} &
\multicolumn{3}{c}{\textbf{STDMAE (PEMSD7)}} &
\multicolumn{3}{c}{\textbf{LightST (PEMSD7)}} &
\multicolumn{3}{c}{\textbf{LSTTN (METR\_LA)}} &
\multicolumn{3}{c}{\textbf{DSTAGNN (PEMSD4)}} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10} \cmidrule(lr){11-13}
& \small MAE$\downarrow$ & \small RMSE$\downarrow$ & \small Gap\%$\downarrow$
& \small MAE$\downarrow$ & \small RMSE$\downarrow$ & \small Gap\%$\downarrow$
& \small MAE$\downarrow$ & \small RMSE$\downarrow$ & \small Gap\%$\downarrow$
& \small MAE$\downarrow$ & \small RMSE$\downarrow$ & \small Gap\%$\downarrow$ \\
\midrule
Reported~(Paper) &
18.65 & 31.44 & 0.00 &
20.78 & 33.95 & 0.00 &
2.96 & 5.92 & 0.00 &
19.30 & 31.46 & 0.00 \\
SWE-agent &
31.96 & 45.87 & 55.38 &
22.21 & 34.76 & 4.09 &
4.50 & 9.84 & 61.49 &
20.11 & 31.48 & 1.64 \\
OpenHands &
21.79 & 34.55 & 12.48 &
26.18 & 38.89 & 18.89 &
6.55 & 11.80 & 106.64 &
20.27 & 31.97 & 2.91 \\
\textbf{AgentCity} &
\textbf{20.19} & \textbf{32.99} & \textbf{6.17} &
\textbf{21.99} & \textbf{34.59} & \textbf{3.38} &
\textbf{3.08} & \textbf{6.12} & \textbf{3.60} &
\textbf{19.90} & \textbf{31.29} & \textbf{0.85} \\
\bottomrule
\end{tabular}}
\end{table*}

\section{Benchmark Validation}
\label{sec:validation}

\subsection{Reproduction Fidelity}
\label{sec:fidelity}

We evaluate the reproduction fidelity of AgentCity by comparing reproduced results with the metrics reported in the original papers.
This analysis examines whether AgentCity reproduces results that are consistent with those reported in prior studies.

We focus on the traffic state prediction task, which has well-established datasets and evaluation protocols and is commonly used in the literature.
Seven representative models are selected for analysis, covering different architectural designs and training strategies.
For each model--dataset pair, we report the MAE and RMSE values stated in the original paper together with the corresponding results reproduced by AgentCity.
The relative gap between reported and reproduced results is summarized in Table~\ref{tab:mae_rmse_comparison}.

Across the examined models and datasets, the reproduced results are generally close to the reported values.
Differences between reproduced results and reported values can arise from software and hardware environments, nondeterministic training behavior, and minor implementation variations.
All results are obtained using a consistent reproduction and evaluation process without manual intervention, indicating that AgentCity reproduces published traffic prediction models with reasonable fidelity.
\subsection{Reproduction Consistency Across Code Agents}
\label{sec:agent_comparison}

We compare the reproduction results obtained by AgentCity with those produced by two general-purpose code-oriented agents, SWE-agent~\cite{Swe-agent} and OpenHands~\cite{OpenHands}.
The comparison examines reproduction consistency, defined as how closely reproduced results match the metrics reported in the original papers.

All agents are evaluated under the same reproduction setting with Claude-4.5-Opus as the underlying language model, operate on the same code repositories and datasets, and follow the same reproduction objective of matching reported MAE and RMSE values.
The prompts used to specify reproduction tasks are identical across agents and are described in Appendix~\ref{Model Adapter}.
Each agent is allowed to iteratively execute, debug, and rerun code until a valid training and evaluation pipeline is completed.
No manual intervention or task-specific adjustment is performed for any agent during the reproduction process.
Table~\ref{tab:selected_models} summarizes the reproduction results.
For each model--dataset pair, the table reports the metrics stated in the original paper together with the reproduced MAE, RMSE, and relative gaps.
Across the evaluated cases, AgentCity produces reproduced results that are closer to the reported values than those obtained by the other agents under the same reproduction setting.
\section{Related Work}

\subsection{Traffic Prediction Benchmarks}

Benchmark research in traffic prediction has progressed from unified deep learning toolkits toward more diverse evaluation settings.
Early benchmarks such as LibCity~\cite{Libcity}, DL-Traff~\cite{Dl-traff}, and TorchSpatial~\cite{Torchspatial} focus on standardizing data processing, task definitions, and evaluation protocols for traffic prediction models, providing a common basis for reproducible comparison of predictive performance.
More recent efforts, including CityBench~\cite{CityBench}, STBench~\cite{STBench}, and USTBench~\cite{USTBench}, extend benchmarking beyond predictive accuracy to assess semantic understanding, reasoning, and planning capabilities of general-purpose models in urban and transportation scenarios.
Despite this progress, most existing traffic prediction benchmarks are constructed and maintained through largely manual processes.
The automation and continuous maintenance of the benchmarking workflow remain insufficiently addressed.

\subsection{LLM Agents for Automated Reproduction and Benchmarking}

Recent advances in large language model (LLM) agents have enabled tighter coupling between natural language reasoning and automated code generation in scientific workflows.
General-purpose frameworks such as SWE-agent~\cite{Swe-agent} and OpenHands~\cite{OpenHands} demonstrate the ability to navigate and modify complex code repositories, while more specialized systems, including ML-Master~\cite{ML-Master} and PiML~\cite{PiML}, focus on automating and optimizing machine learning pipelines.
Building on these capabilities, research-oriented agents such as DeepCode~\cite{DeepCode}, Paper2Code~\cite{Paper2code}, and Agent Laboratory~\cite{Agentlaboratory} aim to support broader stages of the scientific process, ranging from algorithm understanding to experiment execution and reproduction~\cite{Autoreproduce}.
Despite this progress, most existing LLM-based agents are designed for general-purpose code interaction and research automation.
Their workflows do not explicitly account for the domain-specific requirements of traffic and spatiotemporal reproduction, such as heterogeneous data organization, task-specific preprocessing pipelines, and structured spatial representations.
\section{Conclusion}

In this work, we present AgentCity, an AI-maintained framework for the continuous construction and evaluation of traffic prediction benchmarks.
AgentCity formulates benchmark maintenance as a structured, agent-driven workflow that automates literature retrieval, model and data integration, and standardized evaluation under unified protocols, including systematic hyperparameter tuning.
This design allows benchmark construction to be treated as an ongoing and scalable process rather than a one-time manual effort.
Built on this framework, we release a publicly accessible traffic prediction benchmark that spans multiple representative tasks, integrates diverse datasets and model implementations, and provides task-wise leaderboards under consistent evaluation settings.
We further validate the reliability of the framework by comparing reproduced results with those reported in original papers and with results obtained by general-purpose code-oriented agents under the same reproduction settings, demonstrating stable and consistent reproduction performance.
AgentCity enables continuous and scalable maintenance of traffic prediction benchmarks under unified evaluation protocols, providing a reproducible basis for integrating and evaluating models as the benchmark evolves.
|
||||
journal={arXiv e-prints},
|
||||
pages={arXiv--2407},
|
||||
year={2024}
|
||||
}
|
||||
@article{team2024qwen2,
|
||||
title={Qwen2 technical report},
|
||||
author={Team, Qwen},
|
||||
journal={arXiv preprint arXiv:2407.10671},
|
||||
year={2024}
|
||||
}
|
||||
% Old
|
||||
|
||||
@article{sun2025stronger,
|
||||
title={A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
|
||||
author={Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2502.15828},
|
||||
year={2025}
|
||||
}
|
||||
@article{pfeiffer2020mad,
|
||||
title={Mad-x: An adapter-based framework for multi-task cross-lingual transfer},
|
||||
author={Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
|
||||
journal={arXiv preprint arXiv:2005.00052},
|
||||
year={2020}
|
||||
}
|
||||
@article{raffel2020exploring,
|
||||
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
|
||||
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
|
||||
journal={Journal of machine learning research},
|
||||
volume={21},
|
||||
number={140},
|
||||
pages={1--67},
|
||||
year={2020}
|
||||
}
|
||||
@article{zaken2021bitfit,
|
||||
title={Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
|
||||
author={Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
|
||||
journal={arXiv preprint arXiv:2106.10199},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{papineni2002bleu,
|
||||
title={Bleu: a method for automatic evaluation of machine translation},
|
||||
author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
|
||||
booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
|
||||
pages={311--318},
|
||||
year={2002}
|
||||
}
|
||||
@inproceedings{lin2004rouge,
|
||||
title={Rouge: A package for automatic evaluation of summaries},
|
||||
author={Lin, Chin-Yew},
|
||||
booktitle={Text summarization branches out},
|
||||
pages={74--81},
|
||||
year={2004}
|
||||
}
|
||||
@article{jang2016categorical,
|
||||
title={Categorical reparameterization with gumbel-softmax},
|
||||
author={Jang, Eric and Gu, Shixiang and Poole, Ben},
|
||||
journal={arXiv preprint arXiv:1611.01144},
|
||||
year={2016}
|
||||
}
|
||||
@inproceedings{he2015delving,
|
||||
title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},
|
||||
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||
booktitle={Proceedings of the IEEE international conference on computer vision},
|
||||
pages={1026--1034},
|
||||
year={2015}
|
||||
}
|
||||
@article{guo2025nlora,
|
||||
title={NLoRA: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
|
||||
author={Guo, Chenlu and Wu, Yuan and Chang, Yi},
|
||||
journal={arXiv preprint arXiv:2502.14482},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
@article{ba2016layer,
|
||||
title={Layer normalization},
|
||||
author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
|
||||
journal={arXiv preprint arXiv:1607.06450},
|
||||
year={2016}
|
||||
}
|
||||
|
||||
@article{team2023gemini,
|
||||
title={Gemini: a family of highly capable multimodal models},
|
||||
author={Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
|
||||
journal={arXiv preprint arXiv:2312.11805},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2023moelora,
|
||||
title={Moelora: An moe-based parameter efficient fine-tuning method for multi-task medical applications},
|
||||
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
|
||||
journal={arXiv preprint arXiv:2310.18339},
|
||||
year={2023}
|
||||
}
|
||||
@article{wang2023multilora,
|
||||
title={Multilora: Democratizing lora for better multi-task learning},
|
||||
author={Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
|
||||
journal={arXiv preprint arXiv:2311.11501},
|
||||
year={2023}
|
||||
}
|
||||
@article{liu2021p,
|
||||
title={P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
|
||||
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
|
||||
journal={arXiv preprint arXiv:2110.07602},
|
||||
year={2021}
|
||||
}
|
||||
@article{brown2020language,
|
||||
title={Language models are few-shot learners},
|
||||
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
|
||||
journal={Advances in neural information processing systems},
|
||||
volume={33},
|
||||
pages={1877--1901},
|
||||
year={2020}
|
||||
}
|
||||
@article{liu2021conflict,
|
||||
title={Conflict-averse gradient descent for multi-task learning},
|
||||
author={Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={18878--18890},
|
||||
year={2021}
|
||||
}
|
||||
@article{navon2022multi,
|
||||
title={Multi-task learning as a bargaining game},
|
||||
author={Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
|
||||
journal={arXiv preprint arXiv:2202.01017},
|
||||
year={2022}
|
||||
}
|
||||
@article{yu2020gradient,
|
||||
title={Gradient surgery for multi-task learning},
|
||||
author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={33},
|
||||
pages={5824--5836},
|
||||
year={2020}
|
||||
}
|
||||
@article{renduchintala2023tied,
|
||||
title={Tied-lora: Enhacing parameter efficiency of lora with weight tying},
|
||||
author={Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
|
||||
journal={arXiv preprint arXiv:2311.09578},
|
||||
year={2023}
|
||||
}
|
||||
@inproceedings{kwon2023efficient,
|
||||
title={Efficient memory management for large language model serving with pagedattention},
|
||||
author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
|
||||
booktitle={Proceedings of the 29th Symposium on Operating Systems Principles},
|
||||
pages={611--626},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{dai2024deepseekmoe,
|
||||
title={Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models},
|
||||
author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
|
||||
journal={arXiv preprint arXiv:2401.06066},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2025deepseek,
|
||||
title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
|
||||
author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
|
||||
journal={arXiv preprint arXiv:2501.12948},
|
||||
year={2025}
|
||||
}
|
||||
@article{shazeer2017outrageously,
|
||||
title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
|
||||
author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
|
||||
journal={arXiv preprint arXiv:1701.06538},
|
||||
year={2017}
|
||||
}
|
||||
@inproceedings{rajbhandari2022deepspeed,
|
||||
title={Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale},
|
||||
author={Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={18332--18346},
|
||||
year={2022},
|
||||
organization={PMLR}
|
||||
}
|
||||
@article{zhang2023instruction,
|
||||
title={Instruction tuning for large language models: A survey},
|
||||
author={Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
|
||||
journal={arXiv preprint arXiv:2308.10792},
|
||||
year={2023}
|
||||
}
|
||||
@article{pfeiffer2020adapterfusion,
|
||||
title={Adapterfusion: Non-destructive task composition for transfer learning},
|
||||
author={Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
|
||||
journal={arXiv preprint arXiv:2005.00247},
|
||||
year={2020}
|
||||
}
|
||||
@article{pfeiffer2020adapterhub,
|
||||
title={Adapterhub: A framework for adapting transformers},
|
||||
author={Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
|
||||
journal={arXiv preprint arXiv:2007.07779},
|
||||
year={2020}
|
||||
}
|
||||
|
||||
@article{lu2023uniadapter,
|
||||
title={Uniadapter: Unified parameter-efficient transfer learning for cross-modal modeling},
|
||||
author={Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
|
||||
journal={arXiv preprint arXiv:2302.06605},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{fedus2022switch,
|
||||
title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
|
||||
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
|
||||
journal={Journal of Machine Learning Research},
|
||||
volume={23},
|
||||
number={120},
|
||||
pages={1--39},
|
||||
year={2022}
|
||||
}
|
||||
@article{lepikhin2020gshard,
|
||||
title={Gshard: Scaling giant models with conditional computation and automatic sharding},
|
||||
author={Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
|
||||
journal={arXiv preprint arXiv:2006.16668},
|
||||
year={2020}
|
||||
}
|
||||
@article{luo2024moelora,
|
||||
title={Moelora: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
|
||||
author={Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
|
||||
journal={arXiv preprint arXiv:2402.12851},
|
||||
year={2024}
|
||||
}
|
||||
@article{guo2024large,
|
||||
title={Large language model based multi-agents: A survey of progress and challenges},
|
||||
author={Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
|
||||
journal={arXiv preprint arXiv:2402.01680},
|
||||
year={2024}
|
||||
}
|
||||
@article{zhao2023survey,
|
||||
title={A survey of large language models},
|
||||
author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
|
||||
journal={arXiv preprint arXiv:2303.18223},
|
||||
year={2023}
|
||||
}
|
||||
@article{gao2024higher,
|
||||
title={Higher layers need more lora experts},
|
||||
author={Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
|
||||
journal={arXiv preprint arXiv:2402.08562},
|
||||
year={2024}
|
||||
}
|
||||
@inproceedings{dou2024loramoe,
|
||||
title={LoRAMoE: Alleviating world knowledge forgetting in large language models via MoE-style plugin},
|
||||
author={Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
|
||||
booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
|
||||
pages={1932--1945},
|
||||
year={2024}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@article{achiam2023gpt,
|
||||
title={Gpt-4 technical report},
|
||||
author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
|
||||
journal={arXiv preprint arXiv:2303.08774},
|
||||
year={2023}
|
||||
}
|
||||
@article{jaszczur2021sparse,
|
||||
title={Sparse is enough in scaling transformers},
|
||||
author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={9895--9907},
|
||||
year={2021}
|
||||
}
|
||||
@inproceedings{standley2020tasks,
|
||||
title={Which tasks should be learned together in multi-task learning?},
|
||||
author={Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
|
||||
booktitle={International conference on machine learning},
|
||||
pages={9120--9132},
|
||||
year={2020},
|
||||
organization={PMLR}
|
||||
}
|
||||
@article{cai2024survey,
|
||||
title={A survey on mixture of experts},
|
||||
author={Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
|
||||
journal={arXiv preprint arXiv:2407.06204},
|
||||
year={2024}
|
||||
}
|
||||
@article{karimi2021compacter,
|
||||
title={Compacter: Efficient low-rank hypercomplex adapter layers},
|
||||
author={Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={34},
|
||||
pages={1022--1035},
|
||||
year={2021}
|
||||
}
|
||||
@article{bommasani2021opportunities,
|
||||
title={On the opportunities and risks of foundation models},
|
||||
author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
|
||||
journal={arXiv preprint arXiv:2108.07258},
|
||||
year={2021}
|
||||
}
|
||||
@article{pan2024lisa,
|
||||
title={LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
|
||||
author={Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
|
||||
journal={arXiv preprint arXiv:2403.17919},
|
||||
year={2024}
|
||||
}
|
||||
@article{feng2024mixture,
|
||||
title={Mixture-of-loras: An efficient multitask tuning for large language models},
|
||||
author={Feng, Wenfeng and Hao, Chuzhan and Zhang, Yuewei and Han, Yu and Wang, Hao},
|
||||
journal={arXiv preprint arXiv:2403.03432},
|
||||
year={2024}
|
||||
}
|
||||
@article{lester2021power,
|
||||
title={The power of scale for parameter-efficient prompt tuning},
|
||||
author={Lester, Brian and Al-Rfou, Rami and Constant, Noah},
|
||||
journal={arXiv preprint arXiv:2104.08691},
|
||||
year={2021}
|
||||
}
|
||||
@article{zhou2024lima,
|
||||
title={Lima: Less is more for alignment},
|
||||
author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2024}
|
||||
}
|
||||
@article{wei2021finetuned,
|
||||
title={Finetuned language models are zero-shot learners},
|
||||
author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
|
||||
journal={arXiv preprint arXiv:2109.01652},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
@article{brynjolfsson2025generative,
|
||||
title={Generative AI at work},
|
||||
author={Brynjolfsson, Erik and Li, Danielle and Raymond, Lindsey},
|
||||
journal={The Quarterly Journal of Economics},
|
||||
pages={qjae044},
|
||||
year={2025},
|
||||
publisher={Oxford University Press}
|
||||
}
|
||||
@Misc{peft,
|
||||
title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
|
||||
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
|
||||
howpublished = {\url{https://github.com/huggingface/peft}},
|
||||
year = {2022}
|
||||
}
|
||||
@article{li2023chatdoctor,
|
||||
title={ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge},
|
||||
author={Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
|
||||
journal={Cureus},
|
||||
volume={15},
|
||||
number={6},
|
||||
year={2023},
|
||||
publisher={Cureus}
|
||||
}
|
||||
@online{DatabricksBlog2023DollyV2,
|
||||
author = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},
|
||||
title = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},
|
||||
year = {2023},
|
||||
url = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
|
||||
urldate = {2023-06-30}
|
||||
}
|
||||
@inproceedings{nakano2021webgpt,
|
||||
author = {Reiichiro Nakano and Jacob Hilton and Suchir Balaji and Jeff Wu and Long Ouyang and Christina Kim and Christopher Hesse and Shantanu Jain and Vineet Kosaraju and William Saunders and Xu Jiang and Karl Cobbe and Tyna Eloundou and Gretchen Krueger and Kevin Button and Matthew Knight and Benjamin Chess and John Schulman},
|
||||
title = {WebGPT: Browser-assisted question-answering with human feedback},
|
||||
booktitle = {arXiv},
|
||||
year = 2021,
|
||||
}
|
||||
@inproceedings{zhang2023automatic,
|
||||
title={Automatic Chain of Thought Prompting in Large Language Models},
|
||||
author={Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
|
||||
booktitle={The Eleventh International Conference on Learning Representations (ICLR 2023)},
|
||||
year={2023}
|
||||
}
|
||||
@misc{codealpaca,
|
||||
author = {Sahil Chaudhary},
|
||||
title = {Code Alpaca: An Instruction-following LLaMA model for code generation},
|
||||
year = {2023},
|
||||
publisher = {GitHub},
|
||||
journal = {GitHub repository},
|
||||
howpublished = {\url{https://github.com/sahil280114/codealpaca}},
|
||||
}
|
||||
@article{zhao2024hypermoe,
|
||||
title={HyperMoE: Towards Better Mixture of Experts via Transferring Among Experts},
|
||||
author={Zhao, Hao and Qiu, Zihan and Wu, Huijia and Wang, Zili and He, Zhaofeng and Fu, Jie},
|
||||
journal={arXiv preprint arXiv:2402.12656},
|
||||
year={2024}
|
||||
}
|
||||
1029
mypaper/KDD2026_DyPAM.tex
Normal file
1029
mypaper/KDD2026_DyPAM.tex
Normal file
File diff suppressed because it is too large
Load Diff
244
mypaper/arXiv_POI-QA.bib
Normal file
244
mypaper/arXiv_POI-QA.bib
Normal file
@@ -0,0 +1,244 @@
|
||||
@article{bjerva2020subjqa,
|
||||
author = {Johannes Bjerva and Nikita Bhutani and Behzad Golshan and Wang-Chiew Tan and Isabelle Augenstein},
|
||||
title = {SubjQA: A Dataset for Subjectivity and Review Comprehension},
|
||||
journal = {arXiv preprint arXiv:2004.14283},
|
||||
eprint = {2004.14283},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2020}
|
||||
}
|
||||
|
||||
@inproceedings{contractor2021answering,
|
||||
author = {Danish Contractor and Krunal Shah and Aditi Partap and Parag Singla and Mausam},
|
||||
title = {Answering POI-Recommendation Questions Using Tourism Reviews},
|
||||
booktitle = {Proceedings of the 30th ACM International Conference on Information \& Knowledge Management},
|
||||
pages = {281--291},
|
||||
year = {2021}
|
||||
}
|
||||
|
||||
@inproceedings{deng2023spatio,
|
||||
author = {Pan Deng and Yu Zhao and Junting Liu and Xiaofeng Jia and Mulan Wang},
|
||||
title = {Spatio-Temporal Neural Structural Causal Models for Bike Flow Prediction},
|
||||
booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
volume = {37},
|
||||
pages = {4242--4249},
|
||||
year = {2023}
|
||||
}
|
||||
|
||||
@article{dong2022spatiotemporal,
|
||||
author = {Qidi Dong and Jun Cai and Shuo Chen and Pengman He and Xuli Chen},
|
||||
title = {Spatiotemporal Analysis of Urban Green Spatial Vitality and the Corresponding Influencing Factors: A Case Study of Chengdu, China},
|
||||
journal = {Land},
|
||||
volume = {11},
|
||||
number = {10},
|
||||
pages = {1820},
|
||||
year = {2022}
|
||||
}
|
||||
|
||||
@article{feng2024citygpt,
|
||||
author = {Jie Feng and Yuwei Du and Tianhui Liu and Siqi Guo and Yuming Lin and Yong Li},
|
||||
title = {CityGPT: Empowering Urban Spatial Cognition of Large Language Models},
|
||||
journal = {arXiv preprint arXiv:2406.13948},
|
||||
eprint = {2406.13948},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@article{grattafiori2024llama,
|
||||
author = {Aaron Grattafiori and Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Alex Vaughan and others},
|
||||
title = {The Llama 3 Herd of Models},
|
||||
journal = {arXiv preprint arXiv:2407.21783},
|
||||
eprint = {2407.21783},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@article{gruber2024complextempqa,
|
||||
author = {Raphael Gruber and Abdelrahman Abdallah and Michael F{\"a}rber and Adam Jatowt},
|
||||
title = {ComplexTempQA: A Large-Scale Dataset for Complex Temporal Question Answering},
|
||||
journal = {arXiv preprint arXiv:2406.04866},
|
||||
eprint = {2406.04866},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@article{hu2022lora,
|
||||
author = {Edward J. Hu and Yelong Shen and Phillip Wallis and Zeyuan Allen-Zhu and Yuanzhi Li and Shean Wang and Lu Wang and Weizhu Chen and others},
|
||||
title = {LoRA: Low-Rank Adaptation of Large Language Models},
|
||||
journal = {ICLR},
|
||||
volume = {1},
|
||||
number = {2},
|
||||
pages = {3},
|
||||
year = {2022}
|
||||
}
|
||||
|
||||
@inproceedings{jia2018tempquestions,
|
||||
author = {Zhen Jia and Abdalghani Abujabal and Rishiraj Saha Roy and Jannik Str{\"o}tgen and Gerhard Weikum},
|
||||
title = {TempQuestions: A Benchmark for Temporal Question Answering},
|
||||
booktitle = {Companion Proceedings of the The Web Conference 2018},
|
||||
pages = {1057--1062},
|
||||
year = {2018}
|
||||
}
|
||||
|
||||
@article{joshi2017triviaqa,
|
||||
author = {Mandar Joshi and Eunsol Choi and Daniel S. Weld and Luke Zettlemoyer},
|
||||
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
|
||||
journal = {arXiv preprint arXiv:1705.03551},
|
||||
eprint = {1705.03551},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2017}
|
||||
}
|
||||
|
||||
@article{kwiatkowski2019natural,
|
||||
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Jacob Devlin and Kenton Lee and others},
|
||||
title = {Natural Questions: A Benchmark for Question Answering Research},
|
||||
journal = {Transactions of the Association for Computational Linguistics},
|
||||
volume = {7},
|
||||
pages = {453--466},
|
||||
year = {2019}
|
||||
}
|
||||
|
||||
@article{lewis2020retrieval,
|
||||
author = {Patrick Lewis and Ethan Perez and Aleksandra Piktus and Fabio Petroni and Vladimir Karpukhin and Naman Goyal and Heinrich K{\"u}ttler and Mike Lewis and Wen-tau Yih and Tim Rockt{\"a}schel and others},
|
||||
title = {Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks},
|
||||
journal = {Advances in Neural Information Processing Systems},
|
||||
volume = {33},
|
||||
pages = {9459--9474},
|
||||
year = {2020}
|
||||
}
|
||||
|
||||
@article{li2024stbench,
|
||||
author = {Wenbin Li and Di Yao and Ruibo Zhao and Wenjie Chen and Zijie Xu and Chengxue Luo and Chang Gong and Quanliang Jing and Haining Tan and Jingping Bi},
|
||||
title = {STBench: Assessing the Ability of Large Language Models in Spatio-Temporal Analysis},
|
||||
journal = {arXiv preprint arXiv:2406.19065},
|
||||
eprint = {2406.19065},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@inproceedings{DBLP:conf/ijcai/LiCLYH21,
|
||||
author = {Yang Li and Tong Chen and Yadan Luo and Hongzhi Yin and Zi Huang},
|
||||
title = {Discovering Collaborative Signals for Next {POI} Recommendation with Iterative Seq2Graph Augmentation},
|
||||
booktitle = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence, {IJCAI} 2021},
|
||||
pages = {1491--1497},
|
||||
year = {2021},
|
||||
doi = {10.24963/IJCAI.2021/206},
|
||||
url = {https://doi.org/10.24963/ijcai.2021/206}
|
||||
}
|
||||
|
||||
@article{li2025mapqa,
|
||||
author = {Zekun Li and Malcolm Grossman and Mihir Kulkarni and Muhao Chen and Yao-Yi Chiang and others},
|
||||
title = {MapQA: Open-Domain Geospatial Question Answering on Map Data},
|
||||
journal = {arXiv preprint arXiv:2503.07871},
|
||||
eprint = {2503.07871},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2025}
|
||||
}
|
||||
|
||||
@article{ma2023evolution,
|
||||
author = {Dongling Ma and Baoze Liu and Qingji Huang and Qian Zhang},
|
||||
title = {Evolution Characteristics and Causes---An Analysis of Urban Catering Cluster Spatial Structure},
|
||||
journal = {ISPRS International Journal of Geo-Information},
|
||||
volume = {12},
|
||||
number = {8},
|
||||
pages = {302},
|
||||
year = {2023}
|
||||
}
|
||||
|
||||
@inproceedings{mai2018poireviewqa,
|
||||
author = {Gengchen Mai and Krzysztof Janowicz and Cheng He and Sumang Liu and Ni Lao},
|
||||
title = {POIReviewQA: A Semantically Enriched POI Retrieval and Question Answering Dataset},
|
||||
booktitle = {Proceedings of the 12th Workshop on Geographic Information Retrieval},
|
||||
pages = {1--2},
|
||||
year = {2018}
|
||||
}
|
||||
|
||||
@article{mateos2025systematic,
|
||||
author = {Pablo Mateos and Alejandro Bellog{\'\i}n},
|
||||
title = {A Systematic Literature Review of Recent Advances on Context-Aware Recommender Systems},
|
||||
journal = {Artificial Intelligence Review},
|
||||
volume = {58},
|
||||
number = {1},
|
||||
pages = {1--53},
|
||||
year = {2025}
|
||||
}
|
||||
|
||||
@article{tang2022discovering,
|
||||
author = {Wen Tang and Alireza Chakeri and Hamid Krim},
|
||||
title = {Discovering Urban Functional Zones from Biased and Sparse Points of Interests and Sparse Human Activities},
|
||||
journal = {Expert Systems with Applications},
|
||||
volume = {207},
|
||||
pages = {118062},
|
||||
year = {2022}
|
||||
}
|
||||
|
||||
@article{wan2023spatio,
|
||||
author = {Zhongwei Wan and Xin Liu and Benyou Wang and Jiezhong Qiu and Boyu Li and Ting Guo and Guangyong Chen and Yang Wang},
|
||||
title = {Spatio-Temporal Contrastive Learning-Enhanced GNNs for Session-Based Recommendation},
|
||||
journal = {ACM Transactions on Information Systems},
|
||||
volume = {42},
|
||||
number = {2},
|
||||
pages = {1--26},
|
||||
year = {2023}
|
||||
}
|
||||
|
||||
@article{wang2024environmental,
|
||||
author = {Hongcheng Wang and Linfei Li and Xin Xu},
|
||||
title = {Do Environmental Regulation Policies Increase Urban Boundary Pollution? Micro Evidence from Chinese Industrial Enterprises},
|
||||
journal = {Environmental Impact Assessment Review},
|
||||
volume = {106},
|
||||
pages = {107524},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@article{wang2021spatio,
|
||||
author = {Huandong Wang and Qiaohong Yu and Yu Liu and Depeng Jin and Yong Li},
|
||||
title = {Spatio-Temporal Urban Knowledge Graph Enabled Mobility Prediction},
|
||||
journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies},
|
||||
volume = {5},
|
||||
number = {4},
|
||||
pages = {1--24},
|
||||
year = {2021}
|
||||
}
|
||||
|
||||
@article{yang2024qwen2,
|
||||
author = {An Yang and Baosong Yang and Beichen Zhang and Binyuan Hui and Bo Zheng and Bowen Yu and Chengyuan Li and Dayiheng Liu and Fei Huang and Haoran Wei and others},
|
||||
title = {Qwen2.5 Technical Report},
|
||||
journal = {arXiv preprint arXiv:2412.15115},
|
||||
eprint = {2412.15115},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@inproceedings{yang2015wikiqa,
|
||||
author = {Yi Yang and Wen-tau Yih and Christopher Meek},
|
||||
title = {WikiQA: A Challenge Dataset for Open-Domain Question Answering},
|
||||
booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
|
||||
pages = {2013--2018},
|
||||
year = {2015}
|
||||
}
|
||||
|
||||
@article{yu2024survey,
|
||||
author = {Jian Yu and Lucas Guo and Jiayu Zhang and Guiling Wang},
|
||||
title = {A Survey on Graph Neural Network-Based Next POI Recommendation for Smart Cities},
|
||||
journal = {Journal of Reliable Intelligent Environments},
|
||||
volume = {10},
|
||||
number = {3},
|
||||
pages = {299--318},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@book{yu2017chinese,
|
||||
author = {Li Yu},
|
||||
title = {Chinese City and Regional Planning Systems},
|
||||
publisher = {Routledge},
|
||||
year = {2017}
|
||||
}
|
||||
|
||||
@article{yu2024bigcity,
|
||||
author = {Xie Yu and Jingyuan Wang and Yifan Yang and Qian Huang and Ke Qu},
|
||||
title = {BigCity: A Universal Spatiotemporal Model for Unified Trajectory and Traffic State Data Analysis},
|
||||
journal = {arXiv preprint arXiv:2412.00953},
|
||||
eprint = {2412.00953},
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
532
mypaper/arXiv_POI-QA.tex
Normal file
532
mypaper/arXiv_POI-QA.tex
Normal file
@@ -0,0 +1,532 @@
|
||||
\title{A Dataset for Spatiotemporal-Sensitive\\POI Question Answering}
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
\begin{abstract}
|
||||
Spatiotemporal relationships are critical in data science, as many prediction and reasoning tasks require analysis across both spatial and temporal dimensions—for instance, navigating an unfamiliar city involves planning itineraries that sequence locations and timing cultural experiences.
|
||||
However, existing Question-Answering (QA) datasets lack sufficient spatiotemporal-sensitive questions, making them inadequate benchmarks for evaluating models' spatiotemporal reasoning capabilities.
|
||||
To address this gap, we introduce \name, a novel spatiotemporal-sensitive QA dataset centered on Points of Interest (POI), constructed through three key steps: mining and aligning open-source vehicle trajectory data from GAIA with high-precision geographic POI data, rigorously validating noisy spatiotemporal facts by hand, and generating bilingual (Chinese/English) QA pairs that reflect human-understandable spatiotemporal reasoning tasks.
|
||||
Our dataset challenges models to parse complex spatiotemporal dependencies, and evaluations of state-of-the-art multilingual LLMs (\emph{e.g.,} Qwen2.5-7B, Llama3.1-8B) reveal stark limitations: even the top-performing model (Qwen2.5-7B fine-tuned with RAG+LoRA) achieves a top 10 Hit Ratio (HR@10) of only 0.41 on the easiest task, far below human performance at 0.56.
|
||||
This underscores persistent weaknesses in LLMs’ ability to perform consistent spatiotemporal reasoning, while highlighting \name\ as a robust benchmark to advance algorithms sensitive to spatiotemporal dynamics. The dataset is publicly available at \datalink.
|
||||
\end{abstract}
|
||||
\section{Introduction}
|
||||
|
||||
|
||||
Spatiotemporal reasoning plays a pivotal role in a wide range of prediction and decision-making tasks that require sensitivity to both spatial and temporal contexts.
|
||||
This capability depends heavily on spatiotemporal information, which encompasses spatial data, such as geographic locations, and temporal data like time of day or sequential time-based patterns.
|
||||
As a result, spatiotemporal reasoning has become an essential focus in recent research across domains including mobility analysis, personalized recommendation, and spatiotemporal prediction tasks~\cite{wan2023spatio,deng2023spatio,wang2021spatio}.
|
||||
The integration of spatiotemporal reasoning into decision-making processes is not confined to technological applications but is also deeply embedded in the daily routines and choices of individuals~\cite{mateos2025systematic}.
|
||||
For instance, when planning a journey, travelers often consider factors such as the geographical proximity of restaurants offering local specialties and the time required to reach these establishments.
|
||||
This example underscores how both spatial and temporal elements are crucial for making informed decisions.
|
||||
Among the domains where spatiotemporal reasoning is essential, Point of Interest (POI) recommendation stands out as a representative and challenging example. To effectively identify appropriate POIs, models must possess robust spatiotemporal reasoning capabilities. These capabilities enable models to analyze historical user behavior patterns, predict future preferences, and recommend POIs that align with users' interests while accounting for constraints like time and location.
|
||||
In essence, the ability to reason about space and time is fundamental for developing intelligent recommendation systems that cater to diverse user needs and preferences~\cite{yu2024survey}.
|
||||
|
||||
In this paper, we focus on addressing the spatiotemporal challenges of POI prediction with precision and rigor.
|
||||
We formally define POI prediction at travel destinations as spatiotemporal questions based on the following four criteria:
|
||||
\textbf{i) Spatiotemporal Presence:} The question contains both a timestamp, [time], and a geolocation, [place], such as ``Tuesday evening'' and ``221B Baker Street'';
|
||||
\textbf{ii) Spatiotemporal Context Sensitivity:} Answers to similar questions may vary depending on differences in time and/or location, \ie altering the [time] or [place] can result in different answers.
|
||||
\textbf{iii) Spatiotemporal Knowledge Reasoning:} Such questions require broad POI data coverage and the ability to perform spatiotemporal reasoning.
|
||||
\textbf{iv) Human-Readable Answer:} The answer should align with effective human-computer interaction principles, such as providing the POI name along with a specific address rather than raw latitude and longitude coordinates.
|
||||
We found that, despite their ubiquity, spatiotemporal-sensitive questions are under-studied in existing POI QA datasets.
|
||||
For example, SubjQA~\cite{bjerva2020subjqa} focuses on attribute-oriented questions derived from POI reviews, requiring only semantic knowledge and lacking spatial or temporal information. MapQA~\cite{li2025mapqa} supports geographic queries but omits any temporal context. TourismQA~\cite{contractor2021answering}, although built from tourism reviews and containing questions related to time or place, lacks the ability to perform spatiotemporal reasoning.
|
||||
None of these datasets considers the spatiotemporal-sensitive issues specified in criterion ii).
|
||||
|
||||
One of the datasets closest to ours is Foursquare\footnote{https://opensource.foursquare.com/os-places/}, which provides a large amount of POI location information worldwide, along with a large number of user check-in data with timestamps.
|
||||
However, question samples extracted from the above-mentioned dataset fail to meet criteria ii), iii), and iv).
|
||||
Furthermore, the spatiotemporal information in the Foursquare dataset is relatively sparse and fragmented, as many users check in at different POIs on the platform with gaps of several days.
|
||||
Therefore, we propose to construct our own dataset, called \name. We first identify spatiotemporal-evolving relationships from both GAIA trajectory data\footnote{https://outreach.didichuxing.com/} and POI information around those real-time trajectories.
|
||||
Then, a large pool of human annotators is employed to annotate the POIs surrounding each GPS point in every trajectory, with particular attention to double-checking the POIs near pick-up and drop-off locations.
|
||||
Finally, we created bilingual datasets (in Simplified Chinese and English) with multiple levels of granularity, corresponding to different levels of question difficulty. These levels include POI name, POI subcategory, POI medium category, and POI major category. Each level contains over 5,000,000 question-answer pairs, covering about 400,000 distinct POI locations and 30 consecutive days of vehicle trajectory data.
|
||||
Using POI names as labels in QA pairs is more challenging, as it requires more spatiotemporal reasoning and natural language understanding compared to other classification tasks.
|
||||
Figure~\ref{fig:illustration} shows two trajectories and their corresponding QA examples from the \name\ dataset, constructed using both trajectory facts and synthesized contextual information. Although both vehicles depart at similar times on Tuesday, the spatial variation in their departure points leads to different routes and destination contexts. This example highlights the strong spatiotemporal sensitivity of our dataset, where even slight spatial shifts under similar temporal conditions can significantly impact the question context, requiring models to perform spatiotemporal reasoning.
|
||||
The challenges posed by our dataset are threefold:
|
||||
\begin{itemize}[leftmargin=*]
|
||||
\item \textbf{Geographic Knowledge Processing}: This involves accurately identifying and categorizing POIs based on their geographic locations. For example, recognizing that a ``McDonald's'' in a bustling city center may have different operating hours compared to one in a quieter suburban area.
|
||||
|
||||
\item \textbf{Temporal Information Understanding}: This requires the system to understand how temporal factors affect POI availability or relevance. For instance, recognizing that a restaurant may be open for dinner on weekdays but closed on weekends.
|
||||
|
||||
\item \textbf{Spatiotemporal Reasoning}: This involves combining both geographic and temporal information to provide accurate predictions. For example, recognizing that a user asking about the best places to eat near their home at 8pm is likely looking for a restaurant that is still open and close to home.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.95\linewidth]{figs/illustration.png}
|
||||
\caption{A toy example of spatiotemporal-sensitive questions.}
|
||||
\label{fig:illustration}
|
||||
\end{figure}
|
||||
|
||||
We evaluate the performance of different state-of-the-art open-source Large Language Models (LLMs) on \name\ across all levels of granularity and observe that the average HR@10 drops from 0.39 on the coarse-grained ``POI Major Category'' task to 0.21 on the fine-grained ``POI Subcategory'' task, indicating that current models struggle with spatiotemporal understanding and reasoning.
|
||||
In contrast, human performance on the POI Subcategory task reaches an HR@10 of 0.57, highlighting a substantial gap between existing advanced models and human capabilities.
|
||||
Therefore, we believe \name\ could serve as a valuable benchmark for studying this problem.
|
||||
\section{\name\ Dataset}
|
||||
|
||||
In this section, we describe the pipeline used to construct our dataset, \name.
|
||||
It consists of three steps:
|
||||
i) geographic annotation of POIs,
|
||||
ii) trajectory-based POI mapping, and
|
||||
iii) spatiotemporal question-answer pair generation.
|
||||
|
||||
\subsection{Geographic Annotation of POIs}
|
||||
|
||||
|
||||
Before POI annotation, the choice of study area is critical \cite{tang2022discovering,DBLP:conf/ijcai/LiCLYH21}: in sparsely populated areas, POIs tend to be sparsely distributed as well, and the resulting datasets are usually of low quality.
On a global scale, Chinese cities are characterized by high population density and thriving regional economic activity \cite{ma2023elevation,yu2017chinese}.
These characteristics lead to a large number and a rich variety of POIs, making such cities particularly suitable for POI annotation.
|
||||
Therefore, we chose Chengdu, a Chinese city with a population in the tens of millions \cite{dong2022spatiotemporal}, as a suitable location for the dataset.
|
||||
|
||||
Although the POIs in a city may evolve over time through store openings, relocations, or closures, these dynamic changes are simplified in our constructed dataset to ensure consistency with the time frame of the GAIA data.
|
||||
To align with this requirement, we first collected 418,854 POI entries from map engines as of the end of 2016.
|
||||
After rigorous screening, we retained 418,579 POIs that remained stable over the period and excluded 275 POIs that had undergone changes.
|
||||
The POI annotation process followed four core steps:
|
||||
|
||||
\textbf{Data Collection via Map Search Engines}:
|
||||
We crawled POI data from two major map search engines in mainland China: Baidu Maps\footnote{https://lbsyun.baidu.com/} and Amap\footnote{https://lbs.amap.com/}. To ensure comprehensive coverage, we partitioned Chengdu into a 500$\times$500 grid, with each cell approximately 300 meters on a side. For each cell, we queried the search engines for POIs near its center point.
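For concreteness, the following is a minimal sketch of such a grid crawl, not the exact crawler we used; \texttt{query\_nearby\_pois} is a hypothetical stand-in for a map-engine API call, and the bounding-box coordinates are illustrative only.
\begin{verbatim}
# Illustrative bounding box for the study area (assumed values, not the released ones).
LAT_MIN, LAT_MAX = 30.40, 30.95
LON_MIN, LON_MAX = 103.75, 104.35
GRID_N = 500  # 500 x 500 cells, each roughly 300 m on a side

def cell_center(i, j):
    """Return the (lat, lon) center of grid cell (i, j)."""
    lat = LAT_MIN + (i + 0.5) * (LAT_MAX - LAT_MIN) / GRID_N
    lon = LON_MIN + (j + 0.5) * (LON_MAX - LON_MIN) / GRID_N
    return lat, lon

def crawl_all_cells(query_nearby_pois):
    """query_nearby_pois(lat, lon) is a placeholder for a search-engine API call."""
    pois = []
    for i in range(GRID_N):
        for j in range(GRID_N):
            lat, lon = cell_center(i, j)
            pois.extend(query_nearby_pois(lat, lon))
    return pois
\end{verbatim}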
|
||||
|
||||
\textbf{Data Cleaning and Standardization}:
|
||||
Duplicate entries from the search engine results were removed. Subsequently, we standardized the geographic coordinates of each POI to the WGS84 coordinate system to ensure uniformity \cite{wang2024environmental}.
|
||||
|
||||
\textbf{Coordinate Validation and Error Thresholds}:
|
||||
We calculated the coordinate discrepancy of the same POI across platforms. POIs with a coordinate difference of less than $10^{-4}$ (in WGS84 degrees) were retained and recorded. For discrepancies between $10^{-4}$ and $10^{-3}$, a manual review process was conducted to verify and retain valid POIs. POIs with errors exceeding $10^{-3}$ were excluded due to potential inaccuracies.
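A minimal sketch of this three-way rule is given below; it assumes the discrepancy is taken as the per-axis maximum over WGS84 degrees, which is our reading of the procedure rather than a documented detail.
\begin{verbatim}
def validate_poi(coord_a, coord_b):
    """Classify a cross-platform POI pair by coordinate discrepancy.

    coord_a / coord_b: (lat, lon) tuples in WGS84 degrees.
    Returns 'retain', 'manual_review', or 'exclude'.
    """
    diff = max(abs(coord_a[0] - coord_b[0]), abs(coord_a[1] - coord_b[1]))
    if diff < 1e-4:
        return "retain"
    if diff <= 1e-3:
        return "manual_review"
    return "exclude"
\end{verbatim}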
|
||||
|
||||
\textbf{Hierarchical Categorization}: To describe each POI more precisely, we manually annotated all collected POIs a second time.
|
||||
Each POI has three category labels: major category, medium category, and subcategory.
|
||||
For the entire POI dataset, we have divided it into 19 major categories, 122 medium categories and 959 subcategories.
|
||||
For more details, please refer to Appendix \ref{app:Dataset}.
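Conceptually, each annotated POI can be viewed as a small record; the schema below is a sketch with field names of our own choosing, not the released file format.
\begin{verbatim}
from dataclasses import dataclass

@dataclass
class POI:
    name: str             # e.g. "Hui Kang Supermarket"
    lat: float            # WGS84 latitude
    lon: float            # WGS84 longitude
    major_category: str   # one of 19 major categories
    medium_category: str  # one of 122 medium categories
    subcategory: str      # one of 959 subcategories
\end{verbatim}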
|
||||
|
||||
This systematic approach ensured the reliability and temporal consistency of the POI dataset in alignment with GAIA Data’s requirements.
|
||||
|
||||
\subsection{Trajectory-based POI Mapping}
|
||||
|
||||
The POI mapping takes three steps: mining spatiotemporal-evolving travel targets from GAIA data, aligning geographic information with POIs, and human verification.
|
||||
|
||||
\textbf{Mining Spatiotemporal-evolving Travel Targets from GAIA Data}: We first utilize existing vehicle location records from GAIA Data to identify trajectories with distinctive spatiotemporal migration patterns.
|
||||
Subsequently, we employ this data to mine trips that exhibit temporal and spatial evolution.
|
||||
For instance, the vehicle ID ``6c8a8d17e6bbe4cd2fcdb4991b52725e'' in the GAIA Data produces various trip patterns: some travel directly from entertainment venues via main roads to nearby residential areas during weekday evenings, while others divert from community gates to nearby educational institutions on holiday mornings.
|
||||
These behaviors reflect clear spatiotemporal orientations, such as individuals returning home after nightlife activities or students attending weekend cram schools.
|
||||
By screening and filtering vehicle trajectory records with discernible objectives, we successfully extracted over 6 million trajectories characterized by prominent spatiotemporal migration patterns.
|
||||
These trajectories are formatted as: ``carID, timestamp and the location at the pickup point, the positioning sequence during the trip, and the drop-off location.''
|
||||
|
||||
\textbf{Aligning Geographic Information with POIs}:
|
||||
In the task of predicting points of interest at travel destinations, it is essential to map POIs along trajectories, particularly focusing on those near the start and end points.
|
||||
This approach also addresses the need to avoid exposing private information contained in order details or raw GPS sequences.
Our objectives during data processing are therefore anonymization and POI association.
|
||||
The process involves four key steps:
|
||||
i) downsampling the trajectory by retaining positioning information at critical intersections and congestion points while eliminating redundancies;
|
||||
ii) matching all POIs within a 100-meter radius of start and end points, listed from nearest to farthest;
|
||||
iii) using the closest POI for journey positioning points to obscure exact paths;
|
||||
and iv) simplifying timestamps to day of the week and hour.
|
||||
Each track record is then formatted as: ``anonymous carID, timestamp, POIs near pickup location, POIs during trip, POIs near drop-off location.'' This method ensures privacy while maintaining data utility for effective destination prediction.
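As an illustration of steps ii)--iv), the sketch below matches POIs within a 100-meter radius of a point (nearest first) and simplifies a timestamp to day of week and hour; it is a simplified stand-in for the actual pipeline, and a production implementation would typically use a spatial index rather than the linear scan shown here.
\begin{verbatim}
import math
from datetime import datetime

def haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two WGS84 points."""
    r = 6371000.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lon2 - lon1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

def pois_within(lat, lon, pois, radius_m=100.0):
    """Return POIs (objects with .lat/.lon) within radius_m, nearest first."""
    scored = [(haversine_m(lat, lon, p.lat, p.lon), p) for p in pois]
    scored.sort(key=lambda x: x[0])
    return [p for d, p in scored if d <= radius_m]

def simplify_timestamp(ts: datetime):
    """Reduce a timestamp to (day of week, hour), e.g. ('Tuesday', 20)."""
    return ts.strftime("%A"), ts.hour
\end{verbatim}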
|
||||
|
||||
\textbf{Human Verification}:
|
||||
In the prior step, automated programs generate noisy data in batches. The primary sources of errors include:
|
||||
i) anomalies and drifts in trajectory points within GAIA data;
|
||||
and ii) start or end points situated in city suburbs with low POI coverage, leading to unclear descriptions of trajectory endpoints.
|
||||
To address these issues, we employ manual verification by hiring workers.
|
||||
This process involves the following measures:
|
||||
i) Display the start (end) point of the trajectory alongside nearby POIs (from nearest to farthest, shaded from dark to light) on a single map. Identify and mark records with missing or problematic information, correcting POI details if manually matched.
|
||||
ii) Visualize downsampled trajectories directly within the road network. Identify and mark trajectories with obvious anomalies or discontinuities, rectifying waypoints as needed.
|
||||
iii) Assign each trajectory record to at least five different workers for evaluation. If a record is flagged by more than 60\% of evaluators, it is either deleted or adjusted according to the majority opinion.
|
||||
This process ensures data accuracy and reliability through systematic manual verification.
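The flag-aggregation rule in step iii) can be summarized as in the following sketch; the real workflow additionally records the evaluators' suggested corrections, which are not modeled here.
\begin{verbatim}
def resolve_record(flags):
    """flags: booleans from >= 5 workers (True = flagged as problematic).

    A record is kept unless more than 60% of evaluators flag it, in which
    case it is removed or adjusted according to the majority opinion.
    """
    assert len(flags) >= 5, "each record is reviewed by at least five workers"
    return "keep" if sum(flags) / len(flags) <= 0.6 else "remove_or_adjust"
\end{verbatim}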
|
||||
|
||||
\subsection{Spatiotemporal Question-Answer Pair Generation}
|
||||
|
||||
Once we have the precise trajectory-POI matching records, the next step involves generating question-answer pairs that exhibit spatiotemporal correlation.
|
||||
|
||||
\begin{table}[ht]
|
||||
\centering
|
||||
\footnotesize
|
||||
\caption{The dataset statistics.}
|
||||
\label{tab:difficulty}
|
||||
\begin{tabular}{cccc}
|
||||
\toprule
|
||||
\multicolumn{1}{c}{\bfseries{Type}} & \multicolumn{1}{c}{\bfseries{Difficulty}} & \multicolumn{1}{c}{\makecell{\bfseries{Label }\\ \bfseries{Categories}}} & \multicolumn{1}{c}{\bfseries{Specifier}}\\
|
||||
\midrule
|
||||
Major Category Classification & Easy & 19 & \makecell{POIs at travel destination are: \\$[$ \re{Lifestyle Services}, \re{Shopping Service}, ...$]$} \\
|
||||
&&&\\
|
||||
Medium Category Classification & Medium & 122 & \makecell{POIs at travel destination are: \\$[$\re{Beauty Salon}, \re{Supermarket}, ...$]$} \\
|
||||
&&&\\
|
||||
Subcategory Classification & Hard & 959 & \makecell{POIs at travel destination are: \\$[$\re{Plastic Surgery | Healthcare Services}, \\\re{Hui Kang Supermarket}, \\\re{Wanning Supermarket}, ...$]$} \\
|
||||
&&&\\
|
||||
POI Name Generation & Very Hard & 400K+
|
||||
& \makecell{POIs at travel destination are: \\$[$\re{tai shi xing cai yi xue mei rong}\\\re{(No. 75 Fuqiang Street)}, \re{Wanning}\\ \re{(cheng du fu li guang chang)}, ...$]$} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
|
||||
\textbf{Main QA Dataset}:
|
||||
Our dataset consists of two components. The first part contains POI information, describing the locations and spatial relationships of various POIs. The second part is our main dataset, specifically designed for predicting POIs at travel destinations. Both datasets are generated using templates. Since the data originates from China, we provide both simplified Chinese and English versions to support multilingual model training.
|
||||
|
||||
|
||||
The synthesizing procedure is described in Figure~\ref{fig:QA_sample_synthesizing}.
|
||||
As shown in Figure~\ref{fig:QA_sample_synthesizing}, we use '<>' to represent the POI name.
|
||||
Since the English translation of most POIs has no specific meaning, we use the three phrases in '()' to represent the major category, medium category, and subcategory of the POI.
|
||||
To keep the data close to everyday usage and easy for people to understand directly, we also describe the geographic location of each POI using both a natural-language address and longitude-latitude coordinates.
|
||||
Finally, for each POI, we list the nearby POIs and the distances from these POIs to the current POI in the form of an array from near to far.
|
||||
For each POI prediction sample, the POI information near the starting point and the waypoints of a vehicle trajectory forms the question, and the POIs near the end of the trajectory serve as the prediction label.
The prediction label is a list, represented by '[]'.
Each record in the list is a POI, including the POI name and its corresponding three categories.
|
||||
Therefore, this dataset supports two major task types, classification and generation, as shown in Table~\ref{tab:difficulty}.
|
||||
For the classification task, we hope to build a model that can determine the classification category (major category, medium category, and subcategory) of the POI near the destination; for the generation task, we hope to build a model that can directly output the name of the POI near the destination.
|
||||
The difficulty of these four tasks increases progressively, and their overall statistics are summarized in Table~\ref{tab:difficulty}.
|
||||
The license information of the dataset is listed in Appendix~\ref{app:accessibility}.
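A minimal sketch of the template filling used to synthesize one English QA pair is shown below; the template wording and field names are illustrative rather than the exact released phrasing.
\begin{verbatim}
QUESTION_TEMPLATE = (
    "A car departs on {weekday} at {hour}:00. POIs near the pickup point are "
    "{pickup_pois}; POIs passed along the way are {via_pois}. "
    "Which POIs are likely near the travel destination?"
)

def make_qa_pair(record, level="name"):
    """record: dict with weekday, hour, pickup_pois, via_pois, dest_pois.

    dest_pois is a list of dicts keyed by granularity
    ('major', 'medium', 'sub', 'name'); `level` selects the label granularity.
    """
    question = QUESTION_TEMPLATE.format(
        weekday=record["weekday"],
        hour=record["hour"],
        pickup_pois=", ".join(record["pickup_pois"]),
        via_pois=", ".join(record["via_pois"]),
    )
    answer = [poi[level] for poi in record["dest_pois"]]  # list-style label, as in '[...]'
    return question, answer
\end{verbatim}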
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.85\linewidth]{figs/QA_sample_synthesizing.png}
|
||||
\caption{QA sample synthesizing.}
|
||||
\label{fig:QA_sample_synthesizing}
|
||||
\end{figure}
|
||||
|
||||
\textbf{Quality Control}:
|
||||
To obtain a high-quality dataset, we applied detailed quality control throughout the collection process. In the annotation interface, the annotated POIs and timestamps are highlighted with special fonts to help annotators identify them. Each sample is assigned to multiple workers simultaneously, who score its quality independently of one another. If more than 60\% of the scores are negative, the sample is removed. In the final verification step, about 20\% of the records were modified, yielding 5,417,335 high-quality data samples.
|
||||
\section{Models}
|
||||
|
||||
In this section, we first present the formal problem definition for POI prediction at travel destinations.
|
||||
We then introduce the models used to evaluate the proposed dataset.
|
||||
|
||||
\subsection{Learning Problem}
|
||||
|
||||
Here we formally define the problem setup.
The model is given a set of POI information $D_{poi} = \{ poi_1, \cdots, poi_N \}$ and a set of questions $Q = \{ q_1, \cdots, q_M \}$, where each POI information $poi_i, i \in [N]$ and each question $q_j, j \in [M]$ is a textual sequence of fewer than 8,000 tokens.
The model must possess the following capabilities:
i) Semantic Understanding: accurately interpret user queries to identify intent and relevant context.
ii) Information Retrieval: efficiently search through $D_{poi}$ to extract pertinent POI data based on query requirements.
iii) Spatiotemporal Analysis: incorporate spatial and temporal constraints to effectively filter and rank candidate POIs.
iv) Human-Computer Interaction: generate responses that are not only accurate but also presented in a user-friendly manner, ensuring clarity and relevance.
The model's objective is to generate a response string $\hat{A}$ that accurately answers the query by leveraging these capabilities. This involves selecting the most appropriate POI(s) from $D_{poi}$ based on the query's context and constraints, while maintaining a balance between precision and user experience.
The approach integrates natural language processing techniques with spatiotemporal reasoning to achieve robust performance across diverse scenarios.
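
The following minimal sketch shows how the inputs could be assembled for such a model: the POI documents in $D_{poi}$ and a question $q_j$ are serialized into a single prompt (under the 8,000-token budget), and the model returns the answer string $\hat{A}$. The function and variable names are ours and do not correspond to a specific implementation.

\begin{verbatim}
def build_prompt(d_poi, question):
    """Serialize the POI context and one question into a single prompt.

    Illustrative only: the real pipeline may truncate or retrieve a subset
    of D_poi before prompting (see the RAG variant in the next subsection).
    """
    context = "\n".join(f"[POI {i + 1}] {poi}" for i, poi in enumerate(d_poi))
    return ("You are given POI information near a vehicle trajectory.\n"
            + context + "\n"
            + "Question: " + question + "\n"
            + "Answer with the most likely POI(s) near the destination:")

# a_hat = model.generate(build_prompt(d_poi, q_j))   # response string A-hat
\end{verbatim}
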
\subsection{Pre-trained LLMs with SFT and RAG}

To cope with these challenges, especially the four capabilities listed in the previous subsection, we adopt two open-source LLMs as base models: Llama3.1~\cite{grattafiori2024llama} and Qwen2.5~\cite{yang2024qwen2}, both of which achieve state-of-the-art performance on a wide range of open-world QA tasks (\eg Natural Questions~\cite{kwiatkowski2019natural}, TriviaQA~\cite{joshi2017triviaqa}, and WikiQA~\cite{yang2015wikiqa}).

Llama3.1 and Qwen2.5 are both transformer-based decoder architectures that support a 128K context length.
Llama3.1 uses Grouped-Query Attention and follows a post-training pipeline consisting of reward modeling, supervised fine-tuning (SFT), and direct preference optimization (DPO), while Qwen2.5 adopts a two-stage pretraining strategy with RoPE adjusted base frequency (ABF) technology and enhanced Chinese language support.
Appendix~\ref{app:basemodel} provides a detailed description of their design and training processes.

Beyond evaluating model performance on \name\ in a zero-shot setting, we also employ Low-Rank Adaptation (LoRA) fine-tuning~\cite{hu2022lora} and Retrieval-Augmented Generation (RAG)~\cite{lewis2020retrieval} for further assessment. More details are provided in Appendix~\ref{app:LoRA} and Appendix~\ref{app:RAG}.
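
As a simplified sketch of the retrieval step in RAG (the full pipeline, including task decomposition and answer summarization, is described in Appendix~\ref{app:RAG}), candidate POI documents can be ranked by embedding similarity to the query and prepended to the prompt before generation. The \texttt{embed} functions referenced in the usage comment are placeholders for whatever retriever is actually used.

\begin{verbatim}
import numpy as np

def retrieve_top_k(query_vec, poi_vecs, k=5):
    """Indices of the k POI documents most similar to the query.

    Cosine similarity over precomputed embeddings; a stand-in for the
    retriever used in the actual RAG pipeline.
    """
    q = query_vec / np.linalg.norm(query_vec)
    p = poi_vecs / np.linalg.norm(poi_vecs, axis=1, keepdims=True)
    return np.argsort(-(p @ q))[:k]

# Usage sketch (embed / embed_all are placeholder embedding functions):
# top_idx = retrieve_top_k(embed(question), embed_all(d_poi), k=5)
# prompt  = build_prompt([d_poi[i] for i in top_idx], question)
\end{verbatim}
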
\section{Experiments}

In this section, we conduct several baseline experiments to better illustrate our proposed dataset.

\subsection{Experimental Setup}
Experiments are conducted using the two state-of-the-art LLMs mentioned above as base models: Llama3.1-8B and Qwen2.5-7B.
For the Llama model we use the English version of the dataset, while for the Qwen model we use the Chinese version, since each choice yields the best results for the respective model.
The two versions of the dataset have exactly the same content and differ only in language.
Additionally, we employ a specialized model, Deepseek-r1-32B, for fine-grained task decomposition, retrieval-result summarization, and final answer generation in the RAG pipeline, as detailed in the Models section and Appendix~\ref{app:RAG}.
We evaluate multiple model variants to analyze the impact of different methods on spatiotemporal reasoning capabilities: zero-shot, LoRA-based fine-tuning, retrieval-augmented generation (RAG), and a combined RAG+LoRA method.

We fine-tune all models with a bf16 mixed-precision training strategy, using the AdamW optimizer with a learning rate of 1e-4 and a cosine scheduler.
For LoRA-based methods, the rank is set to 16. Models are fine-tuned for 3 epochs with a batch size of 24 per GPU.
The best model is selected based on performance on a validation set consisting of 10\% of the total dataset.
All training is conducted on NVIDIA A100 GPUs with 80\,GB of memory, running Ubuntu 22.04.
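
A minimal sketch of this configuration with the Hugging Face \texttt{transformers} and \texttt{peft} libraries is given below, using the hyperparameters stated above (bf16, AdamW, learning rate 1e-4, cosine schedule, LoRA rank 16, 3 epochs, per-GPU batch size 24); the LoRA target modules and alpha are assumptions, as they are not specified here.

\begin{verbatim}
from transformers import AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B")  # or Llama3.1-8B

lora_config = LoraConfig(
    r=16,                                 # LoRA rank used in our setup
    lora_alpha=32,                        # assumption: not specified above
    target_modules=["q_proj", "v_proj"],  # assumption: common choice
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="outputs",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=3,
    per_device_train_batch_size=24,
    bf16=True,                            # mixed-precision training
    optim="adamw_torch",                  # AdamW optimizer
)
# The actual fine-tuning loop (e.g., a standard Trainer / SFT script) is omitted.
\end{verbatim}
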
\subsection{Evaluation Metrics}

We evaluate model performance on four answer types: POI name, subcategory, medium category, and major category, covering spatiotemporal reasoning at multiple granularities.
We design two evaluation settings that differ in how the answer space is defined: \textbf{QA for Classification Tasks} and \textbf{Open-world Generative QA}.
For both settings, we report Hit Ratio (HR@$k$) and Normalized Discounted Cumulative Gain (NDCG@$k$) at $k\!\in\!\{5,10,20\}$.
For the generative setting, we additionally compute BLEU-based textual-similarity scores to assess lexical quality.
Detailed metric definitions are provided in Appendix~\ref{app:metrics}.
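
For orientation, a standard single-relevant-item implementation of HR@$k$ and NDCG@$k$ would look as follows; this is a sketch of the usual formulas, not our evaluation code.

\begin{verbatim}
import math

def hit_ratio_at_k(ranked, target, k):
    """1.0 if the ground-truth item appears in the top-k predictions, else 0.0."""
    return 1.0 if target in ranked[:k] else 0.0

def ndcg_at_k(ranked, target, k):
    """Binary-relevance NDCG@k with a single relevant item (ideal DCG = 1)."""
    for rank, item in enumerate(ranked[:k], start=1):
        if item == target:
            return 1.0 / math.log2(rank + 1)
    return 0.0

# Ground truth at rank 3 of a top-5 list:
hit_ratio_at_k(["a", "b", "c", "d", "e"], "c", k=5)  # 1.0
ndcg_at_k(["a", "b", "c", "d", "e"], "c", k=5)       # 0.5
\end{verbatim}
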
\subsection{Main Results}
\label{exp:main_results}

Tables~\ref{tab:classification_hr}--\ref{tab:generation_results} summarize the primary results across model variants and metrics for the classification tasks and the open-world generative QA task.
Each table reports the performance of the two base LLMs, Qwen2.5-7B and Llama3.1-8B, under four experimental configurations: zero-shot, LoRA-based fine-tuning, RAG, and combined RAG+LoRA.

\paragraph{QA for Classification tasks.}
As shown in Tables~\ref{tab:classification_hr} and \ref{tab:classification_ndcg}, zero-shot performance is consistently low, confirming that spatiotemporal reasoning remains challenging for out-of-the-box LLMs.
Both LoRA and RAG enhance model performance.
Taking $k=10$ as an example, LoRA improves HR@10 by 0.05 and 0.09 on average for Llama and Qwen, respectively, whereas RAG, through the integration of external spatiotemporal knowledge, achieves slightly larger gains of 0.06 and 0.13.
When combined, RAG+LoRA obtains the best results, outperforming the zero-shot baseline by factors of 2.5 and 3.9 on HR@10 and NDCG@10, respectively.

\begin{table}[ht]
\centering
\caption{Results for classification tasks. We report HR@\{5,10,20\} for each model variant.}
\label{tab:classification_hr}
\small
\resizebox{1.\linewidth}{!}{
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{\textbf{Model}}
& \multicolumn{3}{c|}{\textbf{Major Category}}
& \multicolumn{3}{c|}{\textbf{Medium Category}}
& \multicolumn{3}{c}{\textbf{Subcategory}} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
& \textbf{HR@5} & \textbf{HR@10} & \textbf{HR@20}
& \textbf{HR@5} & \textbf{HR@10} & \textbf{HR@20}
& \textbf{HR@5} & \textbf{HR@10} & \textbf{HR@20}\\
\midrule
Llama3.1-8B (zero-shot)
& 0.0664 & 0.1001 & 0.0917
& 0.0281 & 0.0481 & 0.0695
& 0.0222 & 0.0350 & 0.0372 \\
Qwen2.5-7B (zero-shot)
& 0.1017 & 0.1775 & 0.1650
& 0.0451 & 0.0784 & 0.0814
& 0.0263 & 0.0467 & 0.0673 \\
\midrule
Llama3.1-8B (LoRA)
& 0.1239 & 0.1880 & 0.2067
& 0.0590 & 0.1041 & 0.1241
& 0.0445 & 0.0687 & 0.0797 \\
Qwen2.5-7B (LoRA)
& 0.1950 & 0.3222 & 0.3509
& 0.1004 & 0.1627 & 0.1871
& 0.0611 & 0.1062 & 0.1250 \\
\midrule
Llama3.1-8B (RAG)
& 0.1237 & 0.1770 & 0.2089
& 0.0593 & 0.1155 & 0.1328
& 0.0461 & 0.0721 & 0.0848 \\
Qwen2.5-7B (RAG)
& 0.2099 & \underline{0.3821} & 0.3815
& 0.0967 & 0.1876 & 0.2008
& 0.0650 & 0.1107 & 0.1218 \\
\midrule
Llama3.1-8B (RAG+LoRA)
& \underline{0.2189} & 0.3784 & \underline{0.4356}
& \underline{0.1736} & \underline{0.2966} & \underline{0.3379}
& \underline{0.1092} & \underline{0.2009} & \underline{0.2324} \\
Qwen2.5-7B (RAG+LoRA)
& \textbf{0.2339} & \textbf{0.4062} & \textbf{0.4698}
& \textbf{0.1812} & \textbf{0.2987} & \textbf{0.3577}
& \textbf{0.1288} & \textbf{0.2185} & \textbf{0.2586} \\
\bottomrule
\end{tabular}
}

\small{
Bold and underlined values indicate statistically significant improvement \\(\ie using a two-sided t-test with $p<0.05$) over the best baseline.
}
\end{table}

\begin{table}[ht]
\centering
\caption{Results for classification tasks. We report NDCG@\{5,10,20\} for each model variant.}
\label{tab:classification_ndcg}
\small
\resizebox{1.\linewidth}{!}{
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{\textbf{Model}}
& \multicolumn{3}{c|}{\textbf{Major Category}}
& \multicolumn{3}{c|}{\textbf{Medium Category}}
& \multicolumn{3}{c}{\textbf{Subcategory}} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
& \textbf{NDCG@5} & \textbf{NDCG@10} & \textbf{NDCG@20}
& \textbf{NDCG@5} & \textbf{NDCG@10} & \textbf{NDCG@20}
& \textbf{NDCG@5} & \textbf{NDCG@10} & \textbf{NDCG@20}\\
\midrule
Llama3.1-8B (zero-shot)
& 0.1073 & 0.1841 & 0.2150
& 0.0617 & 0.1241 & 0.1380
& 0.0631 & 0.0842 & 0.1141 \\
Qwen2.5-7B (zero-shot)
& 0.1778 & 0.3130 & 0.3521
& 0.1047 & 0.1736 & 0.2369
& 0.0910 & 0.1319 & 0.1642 \\
\midrule
Llama3.1-8B (LoRA)
& 0.2085 & 0.3448 & 0.3948
& 0.1284 & 0.2268 & 0.2646
& 0.1182 & 0.1959 & 0.2247 \\
Qwen2.5-7B (LoRA)
& 0.3555 & 0.5694 & 0.6976
& 0.1968 & 0.3479 & 0.4270
& 0.1898 & 0.2804 & 0.3241 \\
\midrule
Llama3.1-8B (RAG)
& 0.2436 & 0.3911 & 0.4029
& 0.1319 & 0.2530 & 0.2857
& 0.1304 & 0.2075 & 0.2245 \\
Qwen2.5-7B (RAG)
& 0.3550 & 0.6315 & 0.6790
& 0.2121 & 0.3655 & 0.4646
& 0.1879 & 0.2808 & 0.3250 \\
\midrule
Llama3.1-8B (RAG+LoRA)
& \underline{0.4722} & \underline{0.6940} & \underline{0.7363}
& \underline{0.3512} & \underline{0.6464} & \underline{0.7485}
& \underline{0.3512} & \underline{0.5729} & \underline{0.6595} \\
Qwen2.5-7B (RAG+LoRA)
& \textbf{0.4615} & \textbf{0.7179} & \textbf{0.8307}
& \textbf{0.3699} & \textbf{0.6388} & \textbf{0.7118}
& \textbf{0.3143} & \textbf{0.5767} & \textbf{0.6822} \\
\bottomrule
\end{tabular}
}

\small{
Bold and underlined values indicate statistically significant improvement \\(\ie using a two-sided t-test with $p<0.05$) over the best baseline.
}
\end{table}

\begin{table}[ht]
\centering
\caption{Open-world Generative QA results.
In addition to HR@\{5,10,20\} and NDCG@\{5,10,20\}, we include BERTScore\textsubscript{F1} (reported in the ``BLEUScore'' column) to measure textual similarity.}
\label{tab:generation_results}
\small
\resizebox{1.\linewidth}{!}{
\begin{tabular}{l|ccc|ccc|c}
\toprule
\multirow{2}{*}{\textbf{Model}}
& \multicolumn{3}{c|}{\textbf{Hit Ratio (Full Match)}}
& \multicolumn{3}{c|}{\textbf{NDCG (Full Match)}}
& \multirow{2}{*}{\textbf{BLEUScore}}
\\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
& \textbf{HR@5} & \textbf{HR@10} & \textbf{HR@20}
& \textbf{NDCG@5} & \textbf{NDCG@10} & \textbf{NDCG@20} \\
\midrule
Llama3.1-8B (zero-shot)
& 0.0075 & 0.0112 & 0.0146
& 0.0149 & 0.0244 & 0.0297
& 0.0332 \\
Qwen2.5-7B (zero-shot)
& 0.0119 & 0.0199 & 0.0234
& 0.0213 & 0.0390 & 0.0442
& 0.0254 \\
\midrule
Llama3.1-8B (LoRA)
& 0.0144 & 0.0241 & 0.0282
& 0.0320 & 0.0512 & 0.0589
& 0.2941 \\
Qwen2.5-7B (LoRA)
& 0.0220 & 0.0394 & 0.0459
& 0.0464 & 0.0798 & 0.0940
& 0.3082 \\
\midrule
Llama3.1-8B (RAG)
& 0.0142 & 0.0232 & 0.0294
& 0.0338 & 0.0537 & 0.0640
& 0.4125 \\
Qwen2.5-7B (RAG)
& 0.0226 & 0.0441 & 0.0496
& 0.0484 & 0.0850 & 0.1048
& 0.5321 \\
\midrule
Llama3.1-8B (RAG+LoRA)
& \underline{0.0331} & \underline{0.0584} & \underline{0.0690}
& \underline{0.0725} & \underline{0.1276} & \textbf{0.1509}
& \underline{0.7729} \\
Qwen2.5-7B (RAG+LoRA)
& \textbf{0.0394} & \textbf{0.0616} & \textbf{0.0714}
& \textbf{0.0770} & \textbf{0.1289} & \underline{0.1508}
& \textbf{0.7911} \\
\bottomrule
\end{tabular}
}

\small{
Bold and underlined values indicate statistically significant improvement \\(\ie using a two-sided t-test with $p<0.05$) over the best baseline.
}
\end{table}

\paragraph{Open-world Generative QA.}
As shown in Table~\ref{tab:generation_results}, this task poses a greater challenge, as models must not only reason over complex spatiotemporal constraints but also generate accurately formatted POI names.
Taking $k=10$ as an example, zero-shot HR@10 drops to 0.0112 for Llama and 0.0199 for Qwen, and even the best-performing configuration, RAG combined with LoRA, achieves only 0.06 for HR@10 and 0.1283 for NDCG@10 on average.

Despite the difficulty, both LoRA and RAG contribute positively.
LoRA increases HR@10 by almost 100\%, RAG alone provides an improvement of about 110\%, and their combination yields the largest overall gain, roughly a four-fold improvement in HR@10 over the zero-shot setting.
While the strict ranking metrics remain relatively low, the BLEUScore stays relatively high when RAG and LoRA are combined, indicating that the generated outputs are often textually close to the label even when they do not match it exactly.
This finding highlights the necessity of controlling hallucination and ensuring accurate outputs in generative spatiotemporal QA tasks.
However, these differentiated results also indicate that more precise spatiotemporal relationship modeling is needed to improve accuracy on the proposed dataset.
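
The gap between strict ranking metrics and textual similarity can be illustrated with a small example: a generated POI name that is close to, but not identical with, the label earns no credit under full-match HR/NDCG, yet scores highly on similarity. The snippet below uses a simple character-level ratio from the Python standard library as a stand-in for the similarity score reported in the tables; the POI names are hypothetical.

\begin{verbatim}
from difflib import SequenceMatcher

label     = "Starlight Coffee (Central Plaza Branch)"   # hypothetical ground truth
generated = "Starlight Coffee Central Plaza"            # hypothetical model output

exact_match = generated == label                             # counts toward HR / NDCG
similarity  = SequenceMatcher(None, generated, label).ratio()

exact_match           # False -> no credit under full-match ranking metrics
round(similarity, 2)  # ~0.87 -> high textual similarity despite the mismatch
\end{verbatim}
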
\begin{table}[ht]
\centering
\caption{Performance on the human-paraphrased subset of \name.}
\label{tab:human_results}
\small
\resizebox{1.\linewidth}{!}{
\begin{tabular}{l|ccc|ccc|c}
\toprule
\multirow{2}{*}{\textbf{Task}}
& \multicolumn{3}{c|}{\textbf{Hit Ratio}}
& \multicolumn{3}{c|}{\textbf{NDCG}}
& \multirow{2}{*}{\textbf{BLEUScore}}
\\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
& \textbf{HR@5} & \textbf{HR@10} & \textbf{HR@20}
& \textbf{NDCG@5} & \textbf{NDCG@10} & \textbf{NDCG@20} \\
\midrule
Classification: Major Category
& 0.3493 & 0.5644 & 0.6701
& 0.6518 & 0.7774 & 0.8432
& - \\
Classification: Medium Category
& 0.2891 & 0.4150 & 0.4693
& 0.5119 & 0.6875 & 0.7861
& - \\
Classification: Subcategory
& 0.1833 & 0.3035 & 0.3481
& 0.4411 & 0.6012 & 0.7140
& - \\
\midrule
Generation:\quad\ POI Names
& 0.1548 & 0.1611 & 0.1984
& 0.2096 & 0.2667 & 0.2924
& 0.8655 \\
\bottomrule
\end{tabular}
}
\end{table}

\subsection{Human-Paraphrased Results}
\label{exp:human_para}

To assess how well the models generalize to natural user queries, we asked crowd-workers to paraphrase $N_{\text{para}}{=}1{,}000$ questions from \name's test data.
Table~\ref{tab:human_results} reports the results of the best-performing configuration, RAG+LoRA, on this paraphrased subset.
Across the two base LLMs, the performance drop from template questions to paraphrased questions is quite significant, roughly 70\% on HR and 85\% on NDCG on average.

\section{Related Work}

\subsection{POI-related QA}
In recent years, many works have addressed POI-related tasks, particularly with the rise of location-based services.
Early datasets often involved retrieving factual data from structured knowledge bases or user-generated content.
For instance, POIReviewQA~\cite{mai2018poireviewqa} was proposed to support open-domain search and QA using Yelp reviews.
Tourism reviews have also been used to build POI recommendation questions~\cite{contractor2021answering}.
More recently, MapQA~\cite{li2025mapqa} focuses on open-domain QA over geospatial entities and relationships, using geospatial data as the reference.

While these datasets advance POI-related QA by leveraging user reviews and geospatial data, they primarily focus on knowledge extraction from static information or direct user preference modeling, rather than systematically evaluating a model's spatiotemporal reasoning capabilities. We therefore hope our dataset can serve as a complement to existing POI-related QA research.

\subsection{Spatiotemporal Reasoning}
Spatiotemporal reasoning, which involves understanding and making inferences over the combined dimensions of space and time, is crucial for many AI applications. In NLP and QA, several efforts have targeted temporal reasoning.
For example, recent datasets such as TempQuestions~\cite{jia2018tempquestions} and ComplexTempQA~\cite{gruber2024complextempqa} focus specifically on temporal question answering, with the latter tackling complex queries that require cross-time comparison and multi-hop temporal reasoning. On the spatial side, datasets such as MapQA~\cite{li2025mapqa} evaluate geospatial reasoning by using map data directly.

However, most of these datasets focus primarily on either the temporal or the spatial aspect. \name~aims to fill this gap by providing QA that explicitly considers spatiotemporal dependencies in the context of POI trajectories.

\subsection{Spatiotemporal Foundation LLMs}

LLMs have strong capabilities in general question answering, but there is still considerable room for improvement in spatiotemporal reasoning for specific, dynamic real-world scenarios.
Recently, research has increasingly focused on specialized adaptations to improve LLMs' spatiotemporal understanding and reasoning.
For instance, CityGPT~\cite{feng2024citygpt} aims to empower the urban spatial cognition of LLMs by fine-tuning them on a specially constructed instruction dataset, CityInstruction, which injects urban knowledge and enhances spatial reasoning for city-scale tasks. BIGCity~\cite{yu2024bigcity} proposes a universal spatiotemporal model for the unified analysis of diverse spatiotemporal data types.

In addition, benchmarks such as STBench~\cite{li2024stbench} assess LLMs on a range of spatiotemporal tasks, including knowledge comprehension, spatiotemporal reasoning, accurate computation, and downstream applications.
Our \name~highlights spatiotemporal-sensitive questions for evaluating models' spatiotemporal reasoning.
\section{Conclusion}

In this paper, we explored the importance of spatiotemporal reasoning in real-world tasks.
We highlighted the limitations of existing QA datasets in covering spatiotemporal-sensitive questions and introduced a novel dataset, \name, to address these challenges.
The dataset incorporates de-identified real-world trajectory data and extensive human annotations, providing a comprehensive resource for evaluating spatiotemporal reasoning capabilities.

Our analysis revealed significant performance drops of state-of-the-art models on refined POI prediction tasks, underscoring the need for improved spatiotemporal understanding. With its unique features, including bilingual support and diverse granularities, \name\ serves as a valuable benchmark for advancing research in intelligent recommendation systems. We believe it will play a pivotal role in developing more accurate and context-aware solutions for real-world applications.