1. Fix all table sizes; 2. Fix some table max/second-max mark errors; 3. Add figure & table caption marks; 4. Add some reference bib entries
This commit is contained in:
98
ref.bib
98
ref.bib
@@ -2563,3 +2563,101 @@ LargeST(引过了)
|
||||
archivePrefix = {arXiv},
|
||||
year = {2024}
|
||||
}
|
||||
|
||||
@inproceedings{bogoychev2021not,
  title     = {Not All Parameters Are Born Equal: Attention Is Mostly What You Need},
  author    = {Bogoychev, Nikolay},
  booktitle = {Proceedings of the Fourth {BlackboxNLP} Workshop on Analyzing and Interpreting Neural Networks for {NLP}},
  pages     = {363--374},
  year      = {2021},
}
|
||||
|
||||
@article{olsson2022context,
  author  = {Olsson, Catherine and Elhage, Nelson and Nanda, Neel and Joseph, Nicholas and DasSarma, Nova and Henighan, Tom and Mann, Ben and Askell, Amanda and Bai, Yuntao and Chen, Anna and others},
  title   = {In-context learning and induction heads},
  journal = {arXiv preprint arXiv:2209.11895},
  year    = {2022},
}
|
||||
|
||||
@inproceedings{rahaman2019spectral,
  title        = {On the Spectral Bias of Neural Networks},
  author       = {Rahaman, Nasim and Baratin, Aristide and Arpit, Devansh and Draxler, Felix and Lin, Min and Hamprecht, Fred and Bengio, Yoshua and Courville, Aaron},
  booktitle    = {International Conference on Machine Learning},
  pages        = {5301--5310},
  year         = {2019},
  organization = {PMLR},
}
|
||||
|
||||
@inproceedings{liu2022p,
  title     = {{P-Tuning}: Prompt Tuning Can Be Comparable to Fine-tuning Across Scales and Tasks},
  author    = {Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages     = {61--68},
  year      = {2022},
}
|
||||
|
||||
@article{dettmers2023qlora,
  title   = {{QLoRA}: Efficient Finetuning of Quantized {LLMs}},
  author  = {Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {10088--10115},
  year    = {2023},
}
|
||||
|
||||
@article{caruana1997multitask,
  title     = {Multitask Learning},
  author    = {Caruana, Rich},
  journal   = {Machine Learning},
  volume    = {28},
  number    = {1},
  pages     = {41--75},
  year      = {1997},
  publisher = {Springer},
}
|
||||
|
||||
@inproceedings{chen2018gradnorm,
  title        = {{GradNorm}: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks},
  author       = {Chen, Zhao and Badrinarayanan, Vijay and Lee, Chen-Yu and Rabinovich, Andrew},
  booktitle    = {International Conference on Machine Learning},
  pages        = {794--803},
  year         = {2018},
  organization = {PMLR},
}
|
||||
|
||||
@inproceedings{godey2024anisotropy,
  author    = {Godey, Nathan and Clergerie, {\'E}ric and Sagot, Beno{\^\i}t},
  title     = {Anisotropy is inherent to self-attention in transformers},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {35--48},
  year      = {2024},
}
|
||||
|
||||
@article{liu2018darts,
  title   = {{DARTS}: Differentiable Architecture Search},
  author  = {Liu, Hanxiao and Simonyan, Karen and Yang, Yiming},
  journal = {arXiv preprint arXiv:1806.09055},
  year    = {2018},
}
|
||||
|
||||
@article{frankle2019stabilizing,
  author  = {Frankle, Jonathan and Dziugaite, Gintare Karolina and Roy, Daniel M and Carbin, Michael},
  title   = {Stabilizing the lottery ticket hypothesis},
  journal = {arXiv preprint arXiv:1903.01611},
  year    = {2019},
}
|
||||
|
||||
@article{ilharco2022editing,
  author  = {Ilharco, Gabriel and Ribeiro, Marco Tulio and Wortsman, Mitchell and Gururangan, Suchin and Schmidt, Ludwig and Hajishirzi, Hannaneh and Farhadi, Ali},
  title   = {Editing models with task arithmetic},
  journal = {arXiv preprint arXiv:2212.04089},
  year    = {2022},
}
|
||||
|
||||
@article{devlin2018bert,
  title   = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  author  = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal = {arXiv preprint arXiv:1810.04805},
  year    = {2018},
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user