@comment{Conference paper with an arXiv preprint: the venue is stored in booktitle, the publication status in note, and the arXiv identifier in the eprint fields.}
@inproceedings{yang2024tidaldecodefastaccuratellm,
  author        = {Yang, Lijie and Zhang, Zhihao and Chen, Zhuofu and Li, Zikun and Jia, Zhihao},
  title         = {{TidalDecode}: Fast and Accurate {LLM} Decoding with Position Persistent Sparse Attention},
  booktitle     = {International Conference on Learning Representations},
  year          = {2025},
  note          = {To appear},
  eprint        = {2410.05076},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.05076},
}
2024
LCN
Blocking-Waived Estimation: Improving the Worst-Case End-To-End Delay Analysis in Switched Ethernet
Lijie Yang, Théo Docquier, Ludovic Thomas, and Ye-Qiong Song
@comment{Conference paper: the proceedings name is stored in booktitle without a leading In, which bibliography styles add themselves.}
@inproceedings{lcn2024bwe,
  author    = {Yang, Lijie and Docquier, Th{\'e}o and Thomas, Ludovic and Song, Ye-Qiong},
  title     = {Blocking-Waived Estimation: Improving the Worst-Case End-To-End Delay Analysis in Switched {Ethernet}},
  booktitle = {Proceedings of Local Computer Networks},
  year      = {2024},
}
ICML
Accelerating Retrieval-augmented Language Model Serving with Speculation
Zhihao Zhang, Alan Zhu, Lijie Yang, Yihua Xu, Lanting Li, Phitchaya Mangpo Phothilimthana, and Zhihao Jia
In Proceedings of International Conference on Machine Learning, 2024
@comment{Conference paper: the proceedings name is stored in booktitle without a leading In, which bibliography styles add themselves.}
@inproceedings{zhang2024accelerating,
  author    = {Zhang, Zhihao and Zhu, Alan and Yang, Lijie and Xu, Yihua and Li, Lanting and Phothilimthana, Phitchaya Mangpo and Jia, Zhihao},
  title     = {Accelerating Retrieval-augmented Language Model Serving with Speculation},
  booktitle = {Proceedings of the International Conference on Machine Learning},
  year      = {2024},
}
ASPLOS
SpecInfer: Accelerating Large Language Model Serving with Tree-based Speculative Inference and Verification
Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng*, Zeyu Wang, Zhengxin Zhang, Rae Ying Yee Wong, Alan Zhu, Lijie Yang, and 6 more authors
In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3, Apr 2024
@comment{Conference paper: doi holds the bare identifier, url points at the current doi.org resolver, and series/collection use an ASCII apostrophe.}
@inproceedings{Miao_2024,
  author     = {Miao, Xupeng and Oliaro, Gabriele and Zhang, Zhihao and Cheng, Xinhao and Wang, Zeyu and Zhang, Zhengxin and Wong, Rae Ying Yee and Zhu, Alan and Yang, Lijie and Shi, Xiaoxiang and Shi, Chunan and Chen, Zhuoming and Arfeen, Daiyaan and Abhyankar, Reyna and Jia, Zhihao},
  title      = {{SpecInfer}: Accelerating Large Language Model Serving with Tree-based Speculative Inference and Verification},
  booktitle  = {Proceedings of the 29th {ACM} International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3},
  series     = {{ASPLOS} '24},
  collection = {{ASPLOS} '24},
  publisher  = {ACM},
  year       = {2024},
  month      = apr,
  doi        = {10.1145/3620666.3651335},
  url        = {https://doi.org/10.1145/3620666.3651335},
}
2023
HAL
Technical Report: Worst-case Delay Analysis: a Simulation-based Comparison between Flow Aggregation and CPA