File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -794,20 +794,39 @@ fn estimate_tokens_from_text(text: &str) -> Option<i64> {
794794 return None ;
795795 }
796796
797- let mut estimate = 0_i64 ;
797+ let mut ascii_chars = 0.0_f64 ;
798+ let mut cjk_chars = 0.0_f64 ;
799+ let mut spaces = 0.0_f64 ;
800+
798801 for ch in trimmed. chars ( ) {
799- if ch. is_ascii_whitespace ( ) {
800- continue ;
801- }
802- if ch. is_ascii ( ) {
803- estimate += 1 ;
802+ if ch. is_whitespace ( ) {
803+ spaces += 1.0 ;
804+ } else if is_cjk_character ( ch) {
805+ cjk_chars += 1.0 ;
806+ } else if ch. is_ascii ( ) {
807+ ascii_chars += 1.0 ;
804808 } else {
805- estimate += 2 ;
809+ ascii_chars += 1.0 ;
806810 }
807811 }
808812
809- let estimated_tokens = ( ( estimate + 3 ) / 4 ) . max ( 1 ) ;
810- Some ( estimated_tokens)
813+ let estimated_tokens = ( 0.28 * ascii_chars + 1.4 * cjk_chars + 0.15 * spaces + 4.0 ) . ceil ( ) as i64 ;
814+ Some ( estimated_tokens. max ( 1 ) )
815+ }
816+
/// Returns `true` when `ch` is a Han (CJK) ideograph.
///
/// The check is a pure code-point range test over the Unicode ideograph
/// blocks listed below. Scripts such as Hiragana, Katakana and Hangul are
/// *not* matched; only ideograph blocks are.
fn is_cjk_character(ch: char) -> bool {
    matches!(
        ch,
        // CJK Unified Ideographs Extension A
        '\u{3400}'..='\u{4DBF}'
            // CJK Unified Ideographs
            | '\u{4E00}'..='\u{9FFF}'
            // CJK Compatibility Ideographs
            | '\u{F900}'..='\u{FAFF}'
            // CJK Unified Ideographs Extension B
            | '\u{20000}'..='\u{2A6DF}'
            // CJK Unified Ideographs Extension C
            | '\u{2A700}'..='\u{2B73F}'
            // CJK Unified Ideographs Extension D
            | '\u{2B740}'..='\u{2B81F}'
            // CJK Unified Ideographs Extension E
            | '\u{2B820}'..='\u{2CEAF}'
            // CJK Unified Ideographs Extension F
            | '\u{2CEB0}'..='\u{2EBEF}'
            // CJK Unified Ideographs Extension I
            | '\u{2EBF0}'..='\u{2EE5F}'
            // CJK Compatibility Ideographs Supplement
            | '\u{2F800}'..='\u{2FA1F}'
            // CJK Unified Ideographs Extension G
            | '\u{30000}'..='\u{3134F}'
            // CJK Unified Ideographs Extension H
            | '\u{31350}'..='\u{323AF}'
    )
}
812831
813832async fn collect_stream_bytes (
You can’t perform that action at this time.
0 commit comments