@comment{File-viewer header artifacts ("610 lines / 27 KiB / BibTeX / Executable File") removed; this file is a plain BibTeX database.}
@comment{Conventions: lowercase entry types; one field per line; acronyms and proper
nouns brace-protected in titles so styles that sentence-case do not mangle them;
page ranges use double hyphens; "internal-note" is an ignored field used to flag
data that needs verification. Interleaved "|" extraction artifacts removed.}

@article{fu2025training,
  title   = {Training-free {LLM} Merging for Multi-task Learning},
  author  = {Fu, Zichuan and Wu, Xian and Wang, Yejing and Wang, Wanyu and Ye, Shanshan and Yin, Hongzhi and Chang, Yi and Zheng, Yefeng and Zhao, Xiangyu},
  journal = {arXiv preprint arXiv:2506.12379},
  year    = {2025},
}

@inproceedings{wang2025put,
  title     = {Put Teacher in Student's Shoes: Cross-Distillation for Ultra-compact Model Compression Framework},
  author    = {Wang, Maolin and Chu, Jun and Xie, Sicong and Zang, Xiaoling and Zhao, Yao and Zhong, Wenliang and Zhao, Xiangyu},
  booktitle = {Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 2},
  pages     = {4975--4985},
  year      = {2025},
}

@article{wang2023large,
  title   = {Large multimodal model compression via efficient pruning and distillation at {AntGroup}},
  author  = {Wang, Maolin and Zhao, Yao and Liu, Jiajia and Chen, Jingdong and Zhuang, Chenyi and Gu, Jinjie and Guo, Ruocheng and Zhao, Xiangyu},
  journal = {arXiv preprint arXiv:2312.05795},
  year    = {2023},
}

@inproceedings{liu2024moe,
  title     = {When {MoE} meets {LLMs}: Parameter efficient fine-tuning for multi-task medical applications},
  author    = {Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
  booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  pages     = {1104--1114},
  year      = {2024},
}

@inproceedings{wang2025metalora,
  title        = {{MetaLoRA}: Tensor-Enhanced Adaptive Low-Rank Fine-Tuning},
  author       = {Wang, Maolin and Zhao, Xiangyu and Guo, Ruocheng and Wang, Junhui},
  booktitle    = {2025 IEEE 41st International Conference on Data Engineering (ICDE)},
  pages        = {4680--4684},
  year         = {2025},
  organization = {IEEE},
}

@inproceedings{wang2024llm4msr,
  title     = {{LLM4MSR}: An {LLM}-enhanced paradigm for multi-scenario recommendation},
  author    = {Wang, Yuhao and Wang, Yichao and Fu, Zichuan and Li, Xiangyang and Wang, Wanyu and Ye, Yuyang and Zhao, Xiangyu and Guo, Huifeng and Tang, Ruiming},
  booktitle = {Proceedings of the 33rd ACM International Conference on Information and Knowledge Management},
  pages     = {2472--2481},
  year      = {2024},
}

@article{luo2024moelora,
  title   = {{MoELoRA}: Contrastive learning guided mixture of experts on parameter-efficient fine-tuning for large language models},
  author  = {Luo, Tongxu and Lei, Jiahe and Lei, Fangyu and Liu, Weihao and He, Shizhu and Zhao, Jun and Liu, Kang},
  journal = {arXiv preprint arXiv:2402.12851},
  year    = {2024},
}

@article{guo2024large,
  title   = {Large language model based multi-agents: A survey of progress and challenges},
  author  = {Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang},
  journal = {arXiv preprint arXiv:2402.01680},
  year    = {2024},
}

@article{zhao2023survey,
  title   = {A survey of large language models},
  author  = {Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
  journal = {arXiv preprint arXiv:2303.18223},
  year    = {2023},
}

@article{gao2024higher,
  title   = {Higher layers need more {LoRA} experts},
  author  = {Gao, Chongyang and Chen, Kezhen and Rao, Jinmeng and Sun, Baochen and Liu, Ruibo and Peng, Daiyi and Zhang, Yawen and Guo, Xiaoyuan and Yang, Jie and Subrahmanian, VS},
  journal = {arXiv preprint arXiv:2402.08562},
  year    = {2024},
}

@article{ji2023multi,
  title   = {Multi-factor spatio-temporal prediction based on graph decomposition learning},
  author  = {Ji, Jiahao and Wang, Jingyuan and Mou, Yu and Long, Cheng},
  journal = {arXiv preprint arXiv:2310.10374},
  year    = {2023},
}

@article{ji2025seeing,
  title         = {Seeing the unseen: Learning basis confounder representations for robust traffic prediction},
  author        = {Ji, Jiahao and Zhang, Wentao and Wang, Jingyuan and Huang, Chao},
  year          = {2025},
  internal-note = {Venue missing: @article requires a journal (or change entry type) -- verify and complete before use},
}

@inproceedings{wang2025gtg,
  title     = {{GTG}: Generalizable Trajectory Generation Model for Urban Mobility},
  author    = {Wang, Jingyuan and Lin, Yujing and Li, Yudong},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {39},
  number    = {1},
  pages     = {834--842},
  year      = {2025},
}

@inproceedings{cheng2025poi,
  title     = {{POI-Enhancer}: An {LLM}-based semantic enhancement framework for {POI} representation learning},
  author    = {Cheng, Jiawei and Wang, Jingyuan and Zhang, Yichuan and Ji, Jiahao and Zhu, Yuanshao and Zhang, Zhibo and Zhao, Xiangyu},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {39},
  number    = {11},
  pages     = {11509--11517},
  year      = {2025},
}

@inproceedings{han2025bridging,
  title     = {Bridging traffic state and trajectory for dynamic road network and trajectory representation learning},
  author    = {Han, Chengkai and Wang, Jingyuan and Wang, Yongyao and Yu, Xie and Lin, Hao and Li, Chao and Wu, Junjie},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {39},
  number    = {11},
  pages     = {11763--11771},
  year      = {2025},
}

@inproceedings{yu2025bigcity,
  title        = {{BIGCity}: A universal spatiotemporal model for unified trajectory and traffic state data analysis},
  author       = {Yu, Xie and Wang, Jingyuan and Yang, Yifan and Huang, Qian and Qu, Ke},
  booktitle    = {2025 IEEE 41st International Conference on Data Engineering (ICDE)},
  pages        = {4455--4469},
  year         = {2025},
  organization = {IEEE},
}

@article{zhang2024veccity,
  title   = {{VecCity}: A taxonomy-guided library for map entity representation learning},
  author  = {Zhang, Wentao and Wang, Jingyuan and Yang, Yifan and others},
  journal = {arXiv preprint arXiv:2411.00874},
  year    = {2024},
}

@article{hettige2024airphynet,
  title   = {{AirPhyNet}: Harnessing physics-guided neural networks for air quality prediction},
  author  = {Hettige, Kethmi Hirushini and Ji, Jiahao and Xiang, Shili and Long, Cheng and Cong, Gao and Wang, Jingyuan},
  journal = {arXiv preprint arXiv:2402.03784},
  year    = {2024},
}

@article{wang2023rethinking,
  title   = {Rethinking the evaluation for conversational recommendation in the era of large language models},
  author  = {Wang, Xiaolei and Tang, Xinyu and Zhao, Wayne Xin and Wang, Jingyuan and Wen, Ji-Rong},
  journal = {arXiv preprint arXiv:2305.13112},
  year    = {2023},
}

@article{li2023web,
  title   = {The web can be your oyster for improving large language models},
  author  = {Li, Junyi and Tang, Tianyi and Zhao, Wayne Xin and Wang, Jingyuan and Nie, Jian-Yun and Wen, Ji-Rong},
  journal = {arXiv preprint arXiv:2305.10998},
  year    = {2023},
}

@article{du2021gan,
  title         = {{GAN}-based anomaly detection for multivariate time series using polluted training set},
  author        = {Du, Bowen and Sun, Xuanxuan and Ye, Junchen and Cheng, Ke and Wang, Jingyuan and Sun, Leilei},
  journal       = {IEEE Transactions on Knowledge and Data Engineering},
  volume        = {35},
  number        = {12},
  pages         = {12208--12219},
  year          = {2021},
  publisher     = {IEEE},
  internal-note = {year 2021 may conflict with TKDE vol. 35(12) -- verify publication year},
}

@article{li2023e4srec,
  title   = {{E4SRec}: An elegant effective efficient extensible solution of large language models for sequential recommendation},
  author  = {Li, Xinhang and Chen, Chong and Zhao, Xiangyu and Zhang, Yong and Xing, Chunxiao},
  journal = {arXiv preprint arXiv:2312.02443},
  year    = {2023},
}

@article{fu2025sliding,
  title   = {Sliding Window Attention Training for Efficient Large Language Models},
  author  = {Fu, Zichuan and Song, Wentao and Wang, Yejing and Wu, Xian and Zheng, Yefeng and Zhang, Yingying and Xu, Derong and Wei, Xuetao and Xu, Tong and Zhao, Xiangyu},
  journal = {arXiv preprint arXiv:2502.18845},
  year    = {2025},
}

@article{wang2023multi,
  title   = {Multi-task deep recommender systems: A survey},
  author  = {Wang, Yuhao and Lam, Ha Tsz and Wong, Yi and Liu, Ziru and Zhao, Xiangyu and Wang, Yichao and Chen, Bo and Guo, Huifeng and Tang, Ruiming},
  journal = {arXiv preprint arXiv:2302.03525},
  year    = {2023},
}

@inproceedings{liu2023multi,
  title     = {Multi-task recommendations with reinforcement learning},
  author    = {Liu, Ziru and Tian, Jiejie and Cai, Qingpeng and Zhao, Xiangyu and Gao, Jingtong and Liu, Shuchang and Chen, Dayou and He, Tonghao and Zheng, Dong and Jiang, Peng and others},
  booktitle = {Proceedings of the ACM Web Conference 2023},
  pages     = {1273--1282},
  year      = {2023},
}

@inproceedings{liu2025multi,
  title     = {Multi-task Offline Reinforcement Learning for Online Advertising in Recommender Systems},
  author    = {Liu, Langming and Wang, Wanyu and Zhang, Chi and Li, Bo and Yin, Hongzhi and Wei, Xuetao and Su, Wenbo and Zheng, Bo and Zhao, Xiangyu},
  booktitle = {Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 2},
  pages     = {4635--4646},
  year      = {2025},
}

@inproceedings{he2015delving,
  title     = {Delving deep into rectifiers: Surpassing human-level performance on {ImageNet} classification},
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
  pages     = {1026--1034},
  year      = {2015},
}

@article{guo2025nlora,
  title   = {{NLoRA}: Nystr{\"o}m-Initiated Low-Rank Adaptation for Large Language Models},
  author  = {Guo, Chenlu and Wu, Yuan and Chang, Yi},
  journal = {arXiv preprint arXiv:2502.14482},
  year    = {2025},
}

@article{elfwing2018sigmoid,
  title     = {Sigmoid-weighted linear units for neural network function approximation in reinforcement learning},
  author    = {Elfwing, Stefan and Uchibe, Eiji and Doya, Kenji},
  journal   = {Neural Networks},
  volume    = {107},
  pages     = {3--11},
  year      = {2018},
  publisher = {Elsevier},
}

@article{vaswani2017attention,
  title   = {Attention is all you need},
  author  = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {30},
  year    = {2017},
}

@article{ba2016layer,
  title   = {Layer normalization},
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  journal = {arXiv preprint arXiv:1607.06450},
  year    = {2016},
}

@article{jin2025massive,
  title   = {Massive Values in Self-Attention Modules are the Key to Contextual Knowledge Understanding},
  author  = {Jin, Mingyu and Mei, Kai and Xu, Wujiang and Sun, Mingjie and Tang, Ruixiang and Du, Mengnan and Liu, Zirui and Zhang, Yongfeng},
  journal = {arXiv preprint arXiv:2502.01563},
  year    = {2025},
}

@inproceedings{geva2021transformer,
  title     = {Transformer Feed-Forward Layers Are Key-Value Memories},
  author    = {Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  pages     = {5484--5495},
  year      = {2021},
}

@article{team2023gemini,
  title   = {Gemini: a family of highly capable multimodal models},
  author  = {{Gemini Team} and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and Millican, Katie and others},
  journal = {arXiv preprint arXiv:2312.11805},
  year    = {2023},
}

@article{liu2023moelora,
  title   = {{MoELoRA}: An {MoE}-based parameter efficient fine-tuning method for multi-task medical applications},
  author  = {Liu, Qidong and Wu, Xian and Zhao, Xiangyu and Zhu, Yuanshao and Xu, Derong and Tian, Feng and Zheng, Yefeng},
  journal = {arXiv preprint arXiv:2310.18339},
  year    = {2023},
}

@article{wang2023multilora,
  title   = {{MultiLoRA}: Democratizing {LoRA} for better multi-task learning},
  author  = {Wang, Yiming and Lin, Yu and Zeng, Xiaodong and Zhang, Guannan},
  journal = {arXiv preprint arXiv:2311.11501},
  year    = {2023},
}

@article{liu2021p,
  title   = {{P-tuning v2}: Prompt tuning can be comparable to fine-tuning universally across scales and tasks},
  author  = {Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng Lam and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
  journal = {arXiv preprint arXiv:2110.07602},
  year    = {2021},
}

@article{brown2020language,
  title   = {Language models are few-shot learners},
  author  = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  pages   = {1877--1901},
  year    = {2020},
}

@article{liu2021conflict,
  title   = {Conflict-averse gradient descent for multi-task learning},
  author  = {Liu, Bo and Liu, Xingchao and Jin, Xiaojie and Stone, Peter and Liu, Qiang},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {18878--18890},
  year    = {2021},
}

@article{navon2022multi,
  title   = {Multi-task learning as a bargaining game},
  author  = {Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},
  journal = {arXiv preprint arXiv:2202.01017},
  year    = {2022},
}

@inproceedings{wang2023wavelet,
  title     = {{WHEN}: A {Wavelet-DTW} hybrid attention network for heterogeneous time series analysis},
  author    = {Wang, Jingyuan and Yang, Chen and Jiang, Xiaohan and Wu, Junjie},
  booktitle = {Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
  pages     = {2361--2373},
  year      = {2023},
}

@article{sun2025stronger,
  title   = {A Stronger Mixture of Low-Rank Experts for Fine-Tuning Foundation Models},
  author  = {Sun, Mengyang and Wang, Yihao and Feng, Tao and Zhang, Dan and Zhu, Yifan and Tang, Jie},
  journal = {arXiv preprint arXiv:2502.15828},
  year    = {2025},
}

@article{pfeiffer2020mad,
  title   = {{MAD-X}: An adapter-based framework for multi-task cross-lingual transfer},
  author  = {Pfeiffer, Jonas and Vuli{\'c}, Ivan and Gurevych, Iryna and Ruder, Sebastian},
  journal = {arXiv preprint arXiv:2005.00052},
  year    = {2020},
}

@article{raffel2020exploring,
  title   = {Exploring the limits of transfer learning with a unified text-to-text transformer},
  author  = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
  journal = {Journal of Machine Learning Research},
  volume  = {21},
  number  = {140},
  pages   = {1--67},
  year    = {2020},
}

@article{zaken2021bitfit,
  title   = {{BitFit}: Simple parameter-efficient fine-tuning for transformer-based masked language-models},
  author  = {Zaken, Elad Ben and Ravfogel, Shauli and Goldberg, Yoav},
  journal = {arXiv preprint arXiv:2106.10199},
  year    = {2021},
}

@inproceedings{papineni2002bleu,
  title     = {{BLEU}: a method for automatic evaluation of machine translation},
  author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
  pages     = {311--318},
  year      = {2002},
}

@inproceedings{lin2004rouge,
  title     = {{ROUGE}: A package for automatic evaluation of summaries},
  author    = {Lin, Chin-Yew},
  booktitle = {Text Summarization Branches Out},
  pages     = {74--81},
  year      = {2004},
}

@article{jang2016categorical,
  title   = {Categorical reparameterization with {Gumbel-Softmax}},
  author  = {Jang, Eric and Gu, Shixiang and Poole, Ben},
  journal = {arXiv preprint arXiv:1611.01144},
  year    = {2016},
}

@article{yu2020gradient,
  title   = {Gradient surgery for multi-task learning},
  author  = {Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  pages   = {5824--5836},
  year    = {2020},
}

@article{renduchintala2023tied,
  title   = {Tied-{LoRA}: Enhancing parameter efficiency of {LoRA} with weight tying},
  author  = {Renduchintala, Adithya and Konuk, Tugrul and Kuchaiev, Oleksii},
  journal = {arXiv preprint arXiv:2311.09578},
  year    = {2023},
}

@inproceedings{kwon2023efficient,
  title     = {Efficient memory management for large language model serving with {PagedAttention}},
  author    = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles},
  pages     = {611--626},
  year      = {2023},
}

@article{dai2024deepseekmoe,
  title   = {{DeepSeekMoE}: Towards ultimate expert specialization in mixture-of-experts language models},
  author  = {Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
  journal = {arXiv preprint arXiv:2401.06066},
  year    = {2024},
}

@inproceedings{houlsby2019parameter,
  title        = {Parameter-efficient transfer learning for {NLP}},
  author       = {Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
  booktitle    = {International Conference on Machine Learning},
  pages        = {2790--2799},
  year         = {2019},
  organization = {PMLR},
}

@article{guo2025deepseek,
  title   = {{DeepSeek-R1}: Incentivizing reasoning capability in {LLMs} via reinforcement learning},
  author  = {Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
  journal = {arXiv preprint arXiv:2501.12948},
  year    = {2025},
}

@article{shazeer2017outrageously,
  title   = {Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
  author  = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
  journal = {arXiv preprint arXiv:1701.06538},
  year    = {2017},
}

@inproceedings{rajbhandari2022deepspeed,
  title        = {{DeepSpeed-MoE}: Advancing mixture-of-experts inference and training to power next-generation {AI} scale},
  author       = {Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},
  booktitle    = {International Conference on Machine Learning},
  pages        = {18332--18346},
  year         = {2022},
  organization = {PMLR},
}

@article{zhang2023instruction,
  title   = {Instruction tuning for large language models: A survey},
  author  = {Zhang, Shengyu and Dong, Linfeng and Li, Xiaoya and Zhang, Sen and Sun, Xiaofei and Wang, Shuhe and Li, Jiwei and Hu, Runyi and Zhang, Tianwei and Wu, Fei and others},
  journal = {arXiv preprint arXiv:2308.10792},
  year    = {2023},
}

@article{han2024parameter,
  title   = {Parameter-efficient fine-tuning for large models: A comprehensive survey},
  author  = {Han, Zeyu and Gao, Chao and Liu, Jinyang and Zhang, Jeff and Zhang, Sai Qian},
  journal = {arXiv preprint arXiv:2403.14608},
  year    = {2024},
}

@article{pfeiffer2020adapterfusion,
  title   = {{AdapterFusion}: Non-destructive task composition for transfer learning},
  author  = {Pfeiffer, Jonas and Kamath, Aishwarya and R{\"u}ckl{\'e}, Andreas and Cho, Kyunghyun and Gurevych, Iryna},
  journal = {arXiv preprint arXiv:2005.00247},
  year    = {2020},
}

@article{pfeiffer2020adapterhub,
  title   = {{AdapterHub}: A framework for adapting transformers},
  author  = {Pfeiffer, Jonas and R{\"u}ckl{\'e}, Andreas and Poth, Clifton and Kamath, Aishwarya and Vuli{\'c}, Ivan and Ruder, Sebastian and Cho, Kyunghyun and Gurevych, Iryna},
  journal = {arXiv preprint arXiv:2007.07779},
  year    = {2020},
}

@article{lialin2023scaling,
  title   = {Scaling down to scale up: A guide to parameter-efficient fine-tuning},
  author  = {Lialin, Vladislav and Deshpande, Vijeta and Rumshisky, Anna},
  journal = {arXiv preprint arXiv:2303.15647},
  year    = {2023},
}

@article{li2021prefix,
  title   = {Prefix-tuning: Optimizing continuous prompts for generation},
  author  = {Li, Xiang Lisa and Liang, Percy},
  journal = {arXiv preprint arXiv:2101.00190},
  year    = {2021},
}

@article{lu2023uniadapter,
  title   = {{UniAdapter}: Unified parameter-efficient transfer learning for cross-modal modeling},
  author  = {Lu, Haoyu and Huo, Yuqi and Yang, Guoxing and Lu, Zhiwu and Zhan, Wei and Tomizuka, Masayoshi and Ding, Mingyu},
  journal = {arXiv preprint arXiv:2302.06605},
  year    = {2023},
}

@article{fedus2022switch,
  title   = {Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
  author  = {Fedus, William and Zoph, Barret and Shazeer, Noam},
  journal = {Journal of Machine Learning Research},
  volume  = {23},
  number  = {120},
  pages   = {1--39},
  year    = {2022},
}

@article{lepikhin2020gshard,
  title   = {{GShard}: Scaling giant models with conditional computation and automatic sharding},
  author  = {Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},
  journal = {arXiv preprint arXiv:2006.16668},
  year    = {2020},
}

@inproceedings{dou2024loramoe,
  title     = {{LoRAMoE}: Alleviating world knowledge forgetting in large language models via {MoE}-style plugin},
  author    = {Dou, Shihan and Zhou, Enyu and Liu, Yan and Gao, Songyang and Shen, Wei and Xiong, Limao and Zhou, Yuhao and Wang, Xiao and Xi, Zhiheng and Fan, Xiaoran and others},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {1932--1945},
  year      = {2024},
}

@article{zhang2023adalora,
  title   = {{AdaLoRA}: Adaptive budget allocation for parameter-efficient fine-tuning},
  author  = {Zhang, Qingru and Chen, Minshuo and Bukharin, Alexander and Karampatziakis, Nikos and He, Pengcheng and Cheng, Yu and Chen, Weizhu and Zhao, Tuo},
  journal = {arXiv preprint arXiv:2303.10512},
  year    = {2023},
}

@article{liu2024dora,
  title   = {{DoRA}: Weight-decomposed low-rank adaptation},
  author  = {Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
  journal = {arXiv preprint arXiv:2402.09353},
  year    = {2024},
}

@article{hu2021lora,
  title   = {{LoRA}: Low-rank adaptation of large language models},
  author  = {Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  journal = {arXiv preprint arXiv:2106.09685},
  year    = {2021},
}

@article{achiam2023gpt,
  title   = {{GPT-4} technical report},
  author  = {Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
  journal = {arXiv preprint arXiv:2303.08774},
  year    = {2023},
}

@article{jaszczur2021sparse,
  title   = {Sparse is enough in scaling transformers},
  author  = {Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {9895--9907},
  year    = {2021},
}

@inproceedings{standley2020tasks,
  title        = {Which tasks should be learned together in multi-task learning?},
  author       = {Standley, Trevor and Zamir, Amir and Chen, Dawn and Guibas, Leonidas and Malik, Jitendra and Savarese, Silvio},
  booktitle    = {International Conference on Machine Learning},
  pages        = {9120--9132},
  year         = {2020},
  organization = {PMLR},
}

@article{cai2024survey,
  title   = {A survey on mixture of experts},
  author  = {Cai, Weilin and Jiang, Juyong and Wang, Fan and Tang, Jing and Kim, Sunghun and Huang, Jiayi},
  journal = {arXiv preprint arXiv:2407.06204},
  year    = {2024},
}

@article{karimi2021compacter,
  title   = {{Compacter}: Efficient low-rank hypercomplex adapter layers},
  author  = {Karimi Mahabadi, Rabeeh and Henderson, James and Ruder, Sebastian},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {1022--1035},
  year    = {2021},
}

@article{bommasani2021opportunities,
  title   = {On the opportunities and risks of foundation models},
  author  = {Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
  journal = {arXiv preprint arXiv:2108.07258},
  year    = {2021},
}

@article{pan2024lisa,
  title   = {{LISA}: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
  author  = {Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
  journal = {arXiv preprint arXiv:2403.17919},
  year    = {2024},
}

@article{feng2024mixture,
  title   = {Mixture-of-{LoRAs}: An efficient multitask tuning for large language models},
  author  = {Feng, Wenfeng and Hao, Chuzhan and Zhang, Yuewei and Han, Yu and Wang, Hao},
  journal = {arXiv preprint arXiv:2403.03432},
  year    = {2024},
}

@article{lester2021power,
  title   = {The power of scale for parameter-efficient prompt tuning},
  author  = {Lester, Brian and Al-Rfou, Rami and Constant, Noah},
  journal = {arXiv preprint arXiv:2104.08691},
  year    = {2021},
}

@article{zhou2024lima,
  title   = {{LIMA}: Less is more for alignment},
  author  = {Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  year    = {2024},
}

@article{wei2021finetuned,
  title   = {Finetuned language models are zero-shot learners},
  author  = {Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
  journal = {arXiv preprint arXiv:2109.01652},
  year    = {2021},
}

@article{brynjolfsson2025generative,
  title     = {Generative {AI} at work},
  author    = {Brynjolfsson, Erik and Li, Danielle and Raymond, Lindsey},
  journal   = {The Quarterly Journal of Economics},
  pages     = {qjae044},
  year      = {2025},
  publisher = {Oxford University Press},
}

@misc{peft,
  title        = {{PEFT}: State-of-the-art Parameter-Efficient Fine-Tuning methods},
  author       = {Mangrulkar, Sourab and Gugger, Sylvain and Debut, Lysandre and Belkada, Younes and Paul, Sayak and Bossan, Benjamin},
  howpublished = {\url{https://github.com/huggingface/peft}},
  year         = {2022},
}

@article{li2023chatdoctor,
  title     = {{ChatDoctor}: A Medical Chat Model Fine-Tuned on a Large Language Model {Meta-AI} ({LLaMA}) Using Medical Domain Knowledge},
  author    = {Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
  journal   = {Cureus},
  volume    = {15},
  number    = {6},
  year      = {2023},
  publisher = {Cureus},
}

@online{DatabricksBlog2023DollyV2,
  author  = {Conover, Mike and Hayes, Matt and Mathur, Ankit and Xie, Jianwei and Wan, Jun and Shah, Sam and Ghodsi, Ali and Wendell, Patrick and Zaharia, Matei and Xin, Reynold},
  title   = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned {LLM}},
  year    = {2023},
  url     = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
  urldate = {2023-06-30},
}

@article{nakano2021webgpt,
  title   = {{WebGPT}: Browser-assisted question-answering with human feedback},
  author  = {Nakano, Reiichiro and Hilton, Jacob and Balaji, Suchir and Wu, Jeff and Ouyang, Long and Kim, Christina and Hesse, Christopher and Jain, Shantanu and Kosaraju, Vineet and Saunders, William and Jiang, Xu and Cobbe, Karl and Eloundou, Tyna and Krueger, Gretchen and Button, Kevin and Knight, Matthew and Chess, Benjamin and Schulman, John},
  journal = {arXiv preprint arXiv:2112.09332},
  year    = {2021},
}

@inproceedings{zhang2023automatic,
  title     = {Automatic Chain of Thought Prompting in Large Language Models},
  author    = {Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
  booktitle = {The Eleventh International Conference on Learning Representations (ICLR 2023)},
  year      = {2023},
}

@misc{codealpaca,
  title        = {{Code Alpaca}: An Instruction-following {LLaMA} model for code generation},
  author       = {Chaudhary, Sahil},
  year         = {2023},
  publisher    = {GitHub},
  howpublished = {\url{https://github.com/sahil280114/codealpaca}},
}

@article{zhao2024hypermoe,
  title   = {{HyperMoE}: Towards Better Mixture of Experts via Transferring Among Experts},
  author  = {Zhao, Hao and Qiu, Zihan and Wu, Huijia and Wang, Zili and He, Zhaofeng and Fu, Jie},
  journal = {arXiv preprint arXiv:2402.12656},
  year    = {2024},
}