% KDD2026_DyPAM.bib

% Related
@article{lialin2023scaling,
title={Scaling down to scale up: A guide to parameter-efficient fine-tuning},
author={Lialin, Vladislav and Deshpande, Vijeta and Rumshisky, Anna},
journal={arXiv preprint arXiv:2303.15647},
year={2023}
}
% SDCTFT
@article{shen2024parameter,
title={Parameter-efficient fine-tuning via selective discrete cosine transform},
author={Shen, Yixian and Bi, Qi and Huang, Jia-Hong and Zhu, Hongyi and Pathania, Anuj},
journal={arXiv preprint arXiv:2410.09103},
year={2024}
}
% FourierFT
@article{gao2024parameter,
title={Parameter-efficient fine-tuning with discrete fourier transform},
author={Gao, Ziqi and Wang, Qichao and Chen, Aochuan and Liu, Zijing and Wu, Bingzhe and Chen, Liang and Li, Jia},
journal={arXiv preprint arXiv:2405.03003},
year={2024}
}
@article{hu2025waveletft,
title={WaveletFT: Discrete wavelet transform for parameter-efficient fine-tuning},
author={Hu, Can and Yang, Jie and Song, Shien and Fan, Wentao and Xie, Tao},
journal={Neurocomputing},
pages={130765},
year={2025},
publisher={Elsevier}
}
% Little Wavelet
@article{bilican2025exploring,
title={Exploring Sparsity for Parameter Efficient Fine Tuning Using Wavelets},
author={Bilican, Ahmet and Y{\i}lmaz, M Ak{\i}n and Tekalp, A Murat and Cinbi{\c{s}}, R G{\"o}kberk},
journal={arXiv preprint arXiv:2505.12532},
year={2025}
}
@article{zhang2025f,
title={F-Adapter: Frequency-Adaptive Parameter-Efficient Fine-Tuning in Scientific Machine Learning},
author={Zhang, Hangwei and Kang, Chun and Wang, Yan and Zou, Difan},
journal={arXiv preprint arXiv:2509.23173},
year={2025}
}
% LoCA
@article{du2025loca,
title={LoCA: Location-Aware Cosine Adaptation for Parameter-Efficient Fine-Tuning},
author={Du, Zhekai and Min, Yinjie and Li, Jingjing and Lu, Ke and Zou, Changliang and Peng, Liuhua and Chu, Tingjin and Gong, Mingming},
journal={arXiv preprint arXiv:2502.06820},
year={2025}
}
% FlyLoRA
@article{zou2025flylora,
title={FlyLoRA: Boosting task decoupling and parameter efficiency via implicit rank-wise mixture-of-experts},
author={Zou, Heming and Zang, Yunliang and Xu, Wutong and Zhu, Yao and Ji, Xiangyang},
journal={arXiv preprint arXiv:2510.08396},
year={2025}
}
% LLM
@misc{qwen3technicalreport,
title={Qwen3 Technical Report},
author={Qwen Team},
year={2025},
eprint={2505.09388},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2505.09388},
}
@article{grattafiori2024llama,
title={The llama 3 herd of models},
author={Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and others},
journal={arXiv preprint arXiv:2407.21783},
year={2024}
}
@misc{gemma_2025,
title={Gemma 3},
url={https://goo.gle/Gemma3Report},
publisher={Kaggle},
author={Gemma Team},
year={2025}
}
@article{voita2019analyzing,
title={Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned},
author={Voita, Elena and Talbot, David and Moiseev, Fedor and Sennrich, Rico and Titov, Ivan},
journal={arXiv preprint arXiv:1905.09418},
year={2019}
}
@article{zhang2022mixture,
title={Mixture of attention heads: Selecting attention heads per token},
author={Zhang, Xiaofeng and Shen, Yikang and Huang, Zeyu and Zhou, Jie and Rong, Wenge and Xiong, Zhang},
journal={arXiv preprint arXiv:2210.05144},
year={2022}
}
@article{gu2025unpacking,
title={Unpacking Positional Encoding in Transformers: A Spectral Analysis of Content-Position Coupling},
author={Gu, Zihan and Zhang, Han and Chen, Ruoyu and Hu, Yue and Zhang, Hua},
journal={arXiv preprint arXiv:2505.13027},
year={2025}
}
@inproceedings{yu2025comrope,
title={ComRoPE: Scalable and Robust Rotary Position Embedding Parameterized by Trainable Commuting Angle Matrices},
author={Yu, Hao and Jiang, Tangyu and Jia, Shuning and Yan, Shannan and Liu, Shunning and Qian, Haolong and Li, Guanghao and Dong, Shuting and Yuan, Chun},
booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference},
pages={4508--4517},
year={2025}
}
@article{raganato2020fixed,
title={Fixed encoder self-attention patterns in transformer-based machine translation},
author={Raganato, Alessandro and Scherrer, Yves and Tiedemann, J{\"o}rg},
journal={arXiv preprint arXiv:2002.10260},
year={2020}
}
% IJCAI
@article{han2024parameter,
title={Parameter-efficient fine-tuning for large models: A comprehensive survey},
author={Han, Zeyu and Gao, Chao and Liu, Jinyang and Zhang, Jeff and Zhang, Sai Qian},
journal={arXiv preprint arXiv:2403.14608},
year={2024}
}
@article{pan2025rosa,
title={RoSA: Enhancing Parameter-Efficient Fine-Tuning via RoPE-aware Selective Adaptation in Large Language Models},
author={Pan, Dayan and Wang, Jingyuan and Zhou, Yilong and Cheng, Jiawei and Jia, Pengyue and Zhao, Xiangyu},
journal={arXiv preprint arXiv:2511.21733},
year={2025}
}
@inproceedings{shiracite,
author = {Bhardwaj, Kartikeya and Pandey, Nilesh Prasad and Priyadarshi, Sweta and Ganapathy, Viswanath and Kadambi, Shreya and Esteves, Rafael and Borse, Shubhankar and Whatmough, Paul and Garrepalli, Risheek and Van Baalen, Mart and Teague, Harris and Nagel, Markus},
title = {Sparse high rank adapters},
year = {2024},
isbn = {9798331314385},
publisher = {Curran Associates Inc.},
address = {Red Hook, NY, USA},
booktitle = {Proceedings of the 38th International Conference on Neural Information Processing Systems},
articleno = {438},
numpages = {31},
location = {Vancouver, BC, Canada},
series = {NIPS '24}
}
@article{hu2021lora,
title={Lora: Low-rank adaptation of large language models},
author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
journal={arXiv preprint arXiv:2106.09685},
year={2021}
}
% adapter
@inproceedings{houlsby2019parameter,
title={Parameter-efficient transfer learning for NLP},
author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
booktitle={International conference on machine learning},
pages={2790--2799},
year={2019},
organization={PMLR}
}
% AAAING
% Datasets
% GSM8K
@article{cobbe2021training,
title={Training verifiers to solve math word problems},
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
journal={arXiv preprint arXiv:2110.14168},
year={2021}
}
% SVAMP
@article{patel2021nlp,
title={Are NLP models really able to solve simple math word problems?},
author={Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
journal={arXiv preprint arXiv:2103.07191},
year={2021}
}
% MultiArith
@article{roy2016solving,
title={Solving general arithmetic word problems},
author={Roy, Subhro and Roth, Dan},
journal={arXiv preprint arXiv:1608.01413},
year={2016}
}
% Addsub
@inproceedings{hosseini2014learning,
title={Learning to solve arithmetic word problems with verb categorization},
author={Hosseini, Mohammad Javad and Hajishirzi, Hannaneh and Etzioni, Oren and Kushman, Nate},
booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
pages={523--533},
year={2014}
}
% AQuA
@article{ling2017program,
title={Program induction by rationale generation: Learning to solve and explain algebraic word problems},
author={Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
journal={arXiv preprint arXiv:1705.04146},
year={2017}
}
% SingleEq
@article{koncel2015parsing,
title={Parsing algebraic word problems into equations},
author={Koncel-Kedziorski, Rik and Hajishirzi, Hannaneh and Sabharwal, Ashish and Etzioni, Oren and Ang, Siena Dumas},
journal={Transactions of the Association for Computational Linguistics},
volume={3},
pages={585--597},
year={2015},
publisher={MIT Press}
}
% MAWPS
@inproceedings{koncel2016mawps,
title={MAWPS: A math word problem repository},
author={Koncel-Kedziorski, Rik and Roy, Subhro and Amini, Aida and Kushman, Nate and Hajishirzi, Hannaneh},
booktitle={Proceedings of the 2016 conference of the north american chapter of the association for computational linguistics: human language technologies},
pages={1152--1157},
year={2016}
}
% BoolQ
@article{clark2019boolq,
title={Boolq: Exploring the surprising difficulty of natural yes/no questions},
author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
journal={arXiv preprint arXiv:1905.10044},
year={2019}
}
% PIQA
@inproceedings{bisk2020piqa,
title={Piqa: Reasoning about physical commonsense in natural language},
author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={7432--7439},
year={2020}
}
% SIQA
@article{sap2019socialiqa,
title={Socialiqa: Commonsense reasoning about social interactions},
author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and LeBras, Ronan and Choi, Yejin},
journal={arXiv preprint arXiv:1904.09728},
year={2019}
}
% HellaSwag
@article{zellers2019hellaswag,
title={Hellaswag: Can a machine really finish your sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
journal={arXiv preprint arXiv:1905.07830},
year={2019}
}
% WinoGrande
@inproceedings{sakaguchi2020winogrande,
title={Winogrande: An adversarial winograd schema challenge at scale},
author={Sakaguchi, Keisuke and Le Bras, Ronan and Bhagavatula, Chandra and Choi, Yejin},
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
volume={34},
number={05},
pages={8732--8740},
year={2020}
}
% ARC
@article{clark2018think,
title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
journal={arXiv preprint arXiv:1803.05457},
year={2018}
}
% OpenBookQA
@article{mihaylov2018can,
title={Can a suit of armor conduct electricity? a new dataset for open book question answering},
author={Mihaylov, Todor and Clark, Peter and Khot, Tushar and Sabharwal, Ashish},
journal={arXiv preprint arXiv:1809.02789},
year={2018}
}
% Related
@article{li2021prefix,
title={Prefix-tuning: Optimizing continuous prompts for generation},
author={Li, Xiang Lisa and Liang, Percy},
journal={arXiv preprint arXiv:2101.00190},
year={2021}
}
@article{dong2025attention,
title={Attention Retrieves, MLP Memorizes: Disentangling Trainable Components in the Transformer},
author={Dong, Yihe and Noci, Lorenzo and Khodak, Mikhail and Li, Mufan},
journal={arXiv preprint arXiv:2506.01115},
year={2025}
}
@article{michel2019sixteen,
title={Are sixteen heads really better than one?},
author={Michel, Paul and Levy, Omer and Neubig, Graham},
journal={Advances in neural information processing systems},
volume={32},
year={2019}
}
@article{belinkov2018evaluating,
title={Evaluating layers of representation in neural machine translation on part-of-speech and semantic tagging tasks},
author={Belinkov, Yonatan and M{\`a}rquez, Llu{\'\i}s and Sajjad, Hassan and Durrani, Nadir and Dalvi, Fahim and Glass, James},
journal={arXiv preprint arXiv:1801.07772},
year={2018}
}
% Others
@article{ding2023parameter,
title={Parameter-efficient fine-tuning of large-scale pre-trained language models},
author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
journal={Nature machine intelligence},
volume={5},
number={3},
pages={220--235},
year={2023},
publisher={Nature Publishing Group UK London}
}
@article{peng2023instruction,
title={Instruction tuning with gpt-4},
author={Peng, Baolin and Li, Chunyuan and He, Pengcheng and Galley, Michel and Gao, Jianfeng},
journal={arXiv preprint arXiv:2304.03277},
year={2023}
}
% Baselines
@article{liu2024dora,
title={Dora: Weight-decomposed low-rank adaptation},
author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
journal={arXiv preprint arXiv:2402.09353},
year={2024}
}
@article{zhang2023adalora,
title={Adalora: Adaptive budget allocation for parameter-efficient fine-tuning},
author={Zhang, Qingru and Chen, Minshuo and Bukharin, Alexander and Karampatziakis, Nikos and He, Pengcheng and Cheng, Yu and Chen, Weizhu and Zhao, Tuo},
journal={arXiv preprint arXiv:2303.10512},
year={2023}
}
% C3A
@article{chen2024parameter,
title={Parameter-efficient fine-tuning via circular convolution},
author={Chen, Aochuan and Cheng, Jiashun and Liu, Zijing and Gao, Ziqi and Tsung, Fugee and Li, Yu and Li, Jia},
journal={arXiv preprint arXiv:2407.19342},
year={2024}
}
% BONE
@article{kang2024balancing,
title={Balancing LoRA Performance and Efficiency with Simple Shard Sharing},
author={Kang, Jiale and Yin, Qingyu},
journal={arXiv preprint arXiv:2409.15371},
year={2024}
}
% VeRA
@article{kopiczko2023vera,
title={Vera: Vector-based random matrix adaptation},
author={Kopiczko, Dawid Jan and Blankevoort, Tijmen and Asano, Yuki M},
journal={arXiv preprint arXiv:2310.11454},
year={2023}
}
% OFT
@article{qiu2023controlling,
title={Controlling text-to-image diffusion by orthogonal finetuning},
author={Qiu, Zeju and Liu, Weiyang and Feng, Haiwen and Xue, Yuxuan and Feng, Yao and Liu, Zhen and Zhang, Dan and Weller, Adrian and Sch{\"o}lkopf, Bernhard},
journal={Advances in Neural Information Processing Systems},
volume={36},
pages={79320--79362},
year={2023}
}
% BOFT
@article{liu2023parameter,
title={Parameter-efficient orthogonal finetuning via butterfly factorization},
author={Liu, Weiyang and Qiu, Zeju and Feng, Yao and Xiu, Yuliang and Xue, Yuxuan and Yu, Longhui and Feng, Haiwen and Liu, Zhen and Heo, Juyeon and Peng, Songyou and others},
journal={arXiv preprint arXiv:2311.06243},
year={2023}
}
% IA3
@article{liu2022few,
title={Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning},
author={Liu, Haokun and Tam, Derek and Muqeeth, Mohammed and Mohta, Jay and Huang, Tenghao and Bansal, Mohit and Raffel, Colin A},
journal={Advances in Neural Information Processing Systems},
volume={35},
pages={1950--1965},
year={2022}
}
% LN-Tuning
@article{zhao2023tuning,
title={Tuning layernorm in attention: Towards efficient multi-modal llm finetuning},
author={Zhao, Bingchen and Tu, Haoqin and Wei, Chen and Mei, Jieru and Xie, Cihang},
journal={arXiv preprint arXiv:2312.11420},
year={2023}
}
% Deepspeed
@inproceedings{rasley2020deepspeed,
title={Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters},
author={Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
booktitle={Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery \& data mining},
pages={3505--3506},
year={2020}
}
% Huggingface Transformers
@inproceedings{wolf2020transformers,
title={Transformers: State-of-the-art natural language processing},
author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and others},
booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations},
pages={38--45},
year={2020}
}
@inproceedings{geva2021transformer,
title={Transformer Feed-Forward Layers Are Key-Value Memories},
author={Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer},
booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
pages={5484--5495},
year={2021}
}
@article{meng2022mass,
title={Mass-editing memory in a transformer},
author={Meng, Kevin and Sharma, Arnab Sen and Andonian, Alex and Belinkov, Yonatan and Bau, David},
journal={arXiv preprint arXiv:2210.07229},
year={2022}
}
@article{clark2019does,
title={What does bert look at? an analysis of bert's attention},
author={Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D},
journal={arXiv preprint arXiv:1906.04341},
year={2019}
}
@article{su2024roformer,
title={Roformer: Enhanced transformer with rotary position embedding},
author={Su, Jianlin and Ahmed, Murtadha and Lu, Yu and Pan, Shengfeng and Bo, Wen and Liu, Yunfeng},
journal={Neurocomputing},
volume={568},
pages={127063},
year={2024},
publisher={Elsevier}
}
@article{barbero2024round,
title={Round and round we go! what makes rotary positional encodings useful?},
author={Barbero, Federico and Vitvitskyi, Alex and Perivolaropoulos, Christos and Pascanu, Razvan and Veli{\v{c}}kovi{\'c}, Petar},
journal={arXiv preprint arXiv:2410.06205},
year={2024}
}
@article{jin2025massive,
title={Massive Values in Self-Attention Modules are the Key to Contextual Knowledge Understanding},
author={Jin, Mingyu and Mei, Kai and Xu, Wujiang and Sun, Mingjie and Tang, Ruixiang and Du, Mengnan and Liu, Zirui and Zhang, Yongfeng},
journal={arXiv preprint arXiv:2502.01563},
year={2025}
}
@article{vaswani2017attention,
title={Attention is all you need},
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}
@article{touvron2023llama,
title={Llama: Open and efficient foundation language models},
author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
journal={arXiv preprint arXiv:2302.13971},
year={2023}
}
@article{shazeer2020glu,
title={Glu variants improve transformer},
author={Shazeer, Noam},
journal={arXiv preprint arXiv:2002.05202},
year={2020}
}
@inproceedings{he2016deep,
title={Deep residual learning for image recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={770--778},
year={2016}
}
@article{bai2023qwen,
title={Qwen technical report},
author={Bai, Jinze and Bai, Shuai and Chu, Yunfei and Cui, Zeyu and Dang, Kai and Deng, Xiaodong and Fan, Yang and Ge, Wenbin and Han, Yu and Huang, Fei and others},
journal={arXiv preprint arXiv:2309.16609},
year={2023}
}
% SiLU
@article{elfwing2018sigmoid,
title={Sigmoid-weighted linear units for neural network function approximation in reinforcement learning},
author={Elfwing, Stefan and Uchibe, Eiji and Doya, Kenji},
journal={Neural networks},
volume={107},
pages={3--11},
year={2018},
publisher={Elsevier}
}
@article{press2021train,
title={Train short, test long: Attention with linear biases enables input length extrapolation},
author={Press, Ofir and Smith, Noah A and Lewis, Mike},
journal={arXiv preprint arXiv:2108.12409},
year={2021}
}
@article{ainslie2023gqa,
title={Gqa: Training generalized multi-query transformer models from multi-head checkpoints},
author={Ainslie, Joshua and Lee-Thorp, James and De Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
journal={arXiv preprint arXiv:2305.13245},
year={2023}
}
@article{voita2019bottom,
title={The bottom-up evolution of representations in the transformer: A study with machine translation and language modeling objectives},
author={Voita, Elena and Sennrich, Rico and Titov, Ivan},
journal={arXiv preprint arXiv:1909.01380},
year={2019}
}
@article{hu2023llm,
title={Llm-adapters: An adapter family for parameter-efficient fine-tuning of large language models},
author={Hu, Zhiqiang and Wang, Lei and Lan, Yihuai and Xu, Wanyu and Lim, Ee-Peng and Bing, Lidong and Xu, Xing and Poria, Soujanya and Lee, Roy Ka-Wei},
journal={arXiv preprint arXiv:2304.01933},
year={2023}
}
@article{team2024gemma,
title={Gemma 2: Improving open language models at a practical size},
author={Team, Gemma and Riviere, Morgane and Pathak, Shreya and Sessa, Pier Giuseppe and Hardin, Cassidy and Bhupatiraju, Surya and Hussenot, L{\'e}onard and Mesnard, Thomas and Shahriari, Bobak and Ram{\'e}, Alexandre and others},
journal={arXiv preprint arXiv:2408.00118},
year={2024}
}
@article{dubey2024llama,
title={The llama 3 herd of models},
author={Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
journal={arXiv preprint arXiv:2407.21783},
year={2024}
}
@article{team2024qwen2,
title={Qwen2 technical report},
author={Team, Qwen},
journal={arXiv preprint arXiv:2407.10671},
year={2024}
}
% Old
@article{sun2025stronger,
title={A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
author={Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
journal={arXiv preprint arXiv:2502.15828},
year={2025}
}
@article{pfeiffer2020mad,
title={Mad-x: An adapter-based framework for multi-task cross-lingual transfer},
author={Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
journal={arXiv preprint arXiv:2005.00052},
year={2020}
}
@article{raffel2020exploring,
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
journal={Journal of machine learning research},
volume={21},
number={140},
pages={1--67},
year={2020}
}
@article{zaken2021bitfit,
title={Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
author={Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
journal={arXiv preprint arXiv:2106.10199},
year={2021}
}
@inproceedings{papineni2002bleu,
title={Bleu: a method for automatic evaluation of machine translation},
author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
pages={311--318},
year={2002}
}
@inproceedings{lin2004rouge,
title={Rouge: A package for automatic evaluation of summaries},
author={Lin, Chin-Yew},
booktitle={Text summarization branches out},
pages={74--81},
year={2004}
}
@article{jang2016categorical,
title={Categorical reparameterization with gumbel-softmax},
author={Jang, Eric and Gu, Shixiang and Poole, Ben},
journal={arXiv preprint arXiv:1611.01144},
year={2016}
}
@inproceedings{he2015delving,
title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE international conference on computer vision},
pages={1026--1034},
year={2015}
}
@article{guo2025nlora,
title={NLoRA: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
author={Guo, Chenlu and Wu, Yuan and Chang, Yi},
journal={arXiv preprint arXiv:2502.14482},
year={2025}
}
@article{ba2016layer,
title={Layer normalization},
author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
journal={arXiv preprint arXiv:1607.06450},
year={2016}
}
@article{team2023gemini,
title={Gemini: a family of highly capable multimodal models},
author={Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
journal={arXiv preprint arXiv:2312.11805},
year={2023}
}
@article{liu2023moelora,
title={Moelora: An moe-based parameter efficient fine-tuning method for multi-task medical applications},
author={Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
journal={arXiv preprint arXiv:2310.18339},
year={2023}
}
@article{wang2023multilora,
title={Multilora: Democratizing lora for better multi-task learning},
author={Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
journal={arXiv preprint arXiv:2311.11501},
year={2023}
}
@article{liu2021p,
title={P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
journal={arXiv preprint arXiv:2110.07602},
year={2021}
}
@article{brown2020language,
title={Language models are few-shot learners},
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
journal={Advances in neural information processing systems},
volume={33},
pages={1877--1901},
year={2020}
}
@article{liu2021conflict,
title={Conflict-averse gradient descent for multi-task learning},
author={Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={18878--18890},
year={2021}
}
@article{navon2022multi,
title={Multi-task learning as a bargaining game},
author={Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
journal={arXiv preprint arXiv:2202.01017},
year={2022}
}
@article{yu2020gradient,
title={Gradient surgery for multi-task learning},
author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
journal={Advances in Neural Information Processing Systems},
volume={33},
pages={5824--5836},
year={2020}
}
@article{renduchintala2023tied,
title={Tied-lora: Enhancing parameter efficiency of lora with weight tying},
author={Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
journal={arXiv preprint arXiv:2311.09578},
year={2023}
}
@inproceedings{kwon2023efficient,
title={Efficient memory management for large language model serving with pagedattention},
author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
booktitle={Proceedings of the 29th Symposium on Operating Systems Principles},
pages={611--626},
year={2023}
}
@article{dai2024deepseekmoe,
title={Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models},
author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
journal={arXiv preprint arXiv:2401.06066},
year={2024}
}
@article{guo2025deepseek,
title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
journal={arXiv preprint arXiv:2501.12948},
year={2025}
}
@article{shazeer2017outrageously,
title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
journal={arXiv preprint arXiv:1701.06538},
year={2017}
}
@inproceedings{rajbhandari2022deepspeed,
title={Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale},
author={Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
booktitle={International conference on machine learning},
pages={18332--18346},
year={2022},
organization={PMLR}
}
@article{zhang2023instruction,
title={Instruction tuning for large language models: A survey},
author={Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
journal={arXiv preprint arXiv:2308.10792},
year={2023}
}
@article{pfeiffer2020adapterfusion,
title={Adapterfusion: Non-destructive task composition for transfer learning},
author={Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
journal={arXiv preprint arXiv:2005.00247},
year={2020}
}
@article{pfeiffer2020adapterhub,
title={Adapterhub: A framework for adapting transformers},
author={Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
journal={arXiv preprint arXiv:2007.07779},
year={2020}
}
@article{lu2023uniadapter,
title={Uniadapter: Unified parameter-efficient transfer learning for cross-modal modeling},
author={Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
journal={arXiv preprint arXiv:2302.06605},
year={2023}
}
@article{fedus2022switch,
title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
journal={Journal of Machine Learning Research},
volume={23},
number={120},
pages={1--39},
year={2022}
}
@article{lepikhin2020gshard,
title={Gshard: Scaling giant models with conditional computation and automatic sharding},
author={Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
journal={arXiv preprint arXiv:2006.16668},
year={2020}
}
@article{luo2024moelora,
title={Moelora: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
author={Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
journal={arXiv preprint arXiv:2402.12851},
year={2024}
}
@article{guo2024large,
title={Large language model based multi-agents: A survey of progress and challenges},
author={Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
journal={arXiv preprint arXiv:2402.01680},
year={2024}
}
@article{zhao2023survey,
title={A survey of large language models},
author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
journal={arXiv preprint arXiv:2303.18223},
year={2023}
}
@article{gao2024higher,
title={Higher layers need more lora experts},
author={Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
journal={arXiv preprint arXiv:2402.08562},
year={2024}
}
@inproceedings{dou2024loramoe,
title={LoRAMoE: Alleviating world knowledge forgetting in large language models via MoE-style plugin},
author={Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={1932--1945},
year={2024}
}
@article{achiam2023gpt,
title={Gpt-4 technical report},
author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
journal={arXiv preprint arXiv:2303.08774},
year={2023}
}
@article{jaszczur2021sparse,
title={Sparse is enough in scaling transformers},
author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={9895--9907},
year={2021}
}
@inproceedings{standley2020tasks,
title={Which tasks should be learned together in multi-task learning?},
author={Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
booktitle={International conference on machine learning},
pages={9120--9132},
year={2020},
organization={PMLR}
}
@article{cai2024survey,
title={A survey on mixture of experts},
author={Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
journal={arXiv preprint arXiv:2407.06204},
year={2024}
}
@article{karimi2021compacter,
title={Compacter: Efficient low-rank hypercomplex adapter layers},
author={Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={1022--1035},
year={2021}
}
@article{bommasani2021opportunities,
title={On the opportunities and risks of foundation models},
author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
journal={arXiv preprint arXiv:2108.07258},
year={2021}
}
@article{pan2024lisa,
title={LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
author={Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
journal={arXiv preprint arXiv:2403.17919},
year={2024}
}
@article{feng2024mixture,
title={Mixture-of-loras: An efficient multitask tuning for large language models},
author={Feng, Wenfeng and Hao, Chuzhan and Zhang, Yuewei and Han, Yu and Wang, Hao},
journal={arXiv preprint arXiv:2403.03432},
year={2024}
}
@article{lester2021power,
title={The power of scale for parameter-efficient prompt tuning},
author={Lester, Brian and Al-Rfou, Rami and Constant, Noah},
journal={arXiv preprint arXiv:2104.08691},
year={2021}
}
@article{zhou2024lima,
title={Lima: Less is more for alignment},
author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}
@article{wei2021finetuned,
title={Finetuned language models are zero-shot learners},
author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
journal={arXiv preprint arXiv:2109.01652},
year={2021}
}
@article{brynjolfsson2025generative,
title={Generative AI at work},
author={Brynjolfsson, Erik and Li, Danielle and Raymond, Lindsey},
journal={The Quarterly Journal of Economics},
pages={qjae044},
year={2025},
publisher={Oxford University Press}
}
@Misc{peft,
title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
howpublished = {\url{https://github.com/huggingface/peft}},
year = {2022}
}
@article{li2023chatdoctor,
title={ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge},
author={Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
journal={Cureus},
volume={15},
number={6},
year={2023},
publisher={Cureus}
}
@online{DatabricksBlog2023DollyV2,
author = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},
title = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},
year = {2023},
url = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
urldate = {2023-06-30}
}
@article{nakano2021webgpt,
author = {Reiichiro Nakano and Jacob Hilton and Suchir Balaji and Jeff Wu and Long Ouyang and Christina Kim and Christopher Hesse and Shantanu Jain and Vineet Kosaraju and William Saunders and Xu Jiang and Karl Cobbe and Tyna Eloundou and Gretchen Krueger and Kevin Button and Matthew Knight and Benjamin Chess and John Schulman},
title = {WebGPT: Browser-assisted question-answering with human feedback},
journal = {arXiv preprint arXiv:2112.09332},
year = {2021}
}
@inproceedings{zhang2023automatic,
title={Automatic Chain of Thought Prompting in Large Language Models},
author={Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
booktitle={The Eleventh International Conference on Learning Representations (ICLR 2023)},
year={2023}
}
@misc{codealpaca,
author = {Sahil Chaudhary},
title = {Code Alpaca: An Instruction-following LLaMA model for code generation},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/sahil280114/codealpaca}},
}
@article{zhao2024hypermoe,
title={HyperMoE: Towards Better Mixture of Experts via Transferring Among Experts},
author={Zhao, Hao and Qiu, Zihan and Wu, Huijia and Wang, Zili and He, Zhaofeng and Fu, Jie},
journal={arXiv preprint arXiv:2402.12656},
year={2024}
}