
    |Mg%                     n    d Z ddlmZ ddlZddlZddlZ G d d          Z	 dddZ	 dddZddZ	d Z
dS )zJThis is an educational implementation of the byte pair encoding algorithm.    )annotationsNc                  f    e Zd ZddZdddZddZddZddZed d            Z	ed             Z
dS )!SimpleBytePairEncodingpat_strstrmergeable_ranksdict[bytes, int]returnNonec                   || _         || _        d |                                D             | _        t	          j        |          | _        dS )zCreates an Encoding object.c                    i | ]\  }}||	S  r   ).0token_bytestokens      G/var/www/api/venv/lib/python3.11/site-packages/tiktoken/_educational.py
<dictcomp>z3SimpleBytePairEncoding.__init__.<locals>.<dictcomp>   s    ^^^0BU^^^    N)r   r   items_decoderregexcompile_pat)selfr   r   s      r   __init__zSimpleBytePairEncoding.__init__   sI     .^^oF[F[F]F]^^^M'**			r   colourtext	visualise
str | None	list[int]c                    | j                             |          }g }|D ]C}|                    d          }t          | j        ||          }|                    |           D|S )z`Encodes a string into tokens.

        >>> enc.encode("hello world")
        [388, 372]
        utf-8)r   )r   findallencode
bpe_encoder   extend)r   r   r   wordstokensword
word_bytesword_tokenss           r   r$   zSimpleBytePairEncoding.encode   sp     	!!$'' 	' 	'DW--J$T%9:QZ[[[KMM+&&&&r   r(   bytesc                F     d                      fd|D                       S )znDecodes a list of tokens into bytes.

        >>> enc.decode_bytes([388, 372])
        b'hello world'
        r   c              3  2   K   | ]}j         |         V  d S Nr   r   r   r   s     r   	<genexpr>z6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>-   s*      AAe,AAAAAAr   )joinr   r(   s   ` r   decode_bytesz#SimpleBytePairEncoding.decode_bytes'   s,     xxAAAA&AAAAAAr   c                V    |                      |                              dd          S )u   Decodes a list of tokens into a string.

        Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
        the invalid bytes with the replacement character "�".

        >>> enc.decode([388, 372])
        'hello world'
        r"   replaceerrors)r5   decoder4   s     r   r:   zSimpleBytePairEncoding.decode/   s+       ((//	/JJJr   list[bytes]c                       fd|D             S )zDecodes a list of tokens into a list of bytes.

        Useful for visualising how a string is tokenised.

        >>> enc.decode_tokens_bytes([388, 372])
        [b'hello', b' world']
        c                *    g | ]}j         |         S r   r0   r1   s     r   
<listcomp>z>SimpleBytePairEncoding.decode_tokens_bytes.<locals>.<listcomp>B   s     999e$999r   r   r4   s   ` r   decode_tokens_bytesz*SimpleBytePairEncoding.decode_tokens_bytes:   s     :999&9999r   training_data
vocab_sizeintc                H    t          | ||          }t          ||          S )z#Train a BPE tokeniser on some data!)datarA   r   r   r   )	bpe_trainr   )r@   rA   r   r   s       r   trainzSimpleBytePairEncoding.trainD   s,     $:W^___%gWWWWr   c                    t          | t                    rt          j        |           } t	          | j        | j                  S )NrE   )
isinstancer   tiktokenget_encodingr   _pat_str_mergeable_ranks)encodings    r   from_tiktokenz$SimpleBytePairEncoding.from_tiktokenJ   sF    h$$ 	7,X66H%%x7P
 
 
 	
r   N)r   r   r   r	   r
   r   r   )r   r   r   r   r
   r    )r(   r    r
   r,   )r(   r    r
   r   )r(   r    r
   r;   )r@   r   rA   rB   r   r   )__name__
__module____qualname__r   r$   r5   r:   r?   staticmethodrG   rO   r   r   r   r   r      s        + + + +     B B B B	K 	K 	K 	K: : : : X X X \X
 
 
 \
 
 
r   r   r   r   r	   inputr,   r   r   r
   r    c                    d |D             }	 |r)|dv rt          |           n|dk    rt          |           d }d }t          t          |d d         |dd                              D ]7\  }}                     |d         |d         z             }||||k     r|}|}8|n3|J |d |         ||         ||dz            z   gz   ||dz   d          z   }|rt                        fd	|D             }	|	S )
Nc                .    g | ]}t          |g          S r   r,   r   bs     r   r>   zbpe_encode.<locals>.<listcomp>V   s     '''AUA3ZZ'''r   Tr   colorsimple   r      c                     g | ]
}|         S r   r   )r   partr   s     r   r>   zbpe_encode.<locals>.<listcomp>s   s    666od#666r   )visualise_tokensprint	enumeratezipget)
r   rU   r   partsmin_idxmin_rankipairrankr(   s
   `         r   r%   r%   S   sf    (''''E_ 	/// ''''h&&e  U3B3Zqrr!;!;<< 	  	 GAt"&&tAwa'899DX%5 """ hwh5>E'A+4F#F"GG%PWZ[P[P]P]J^^/_2  6666666FMr   rD   r   rA   rB   r   c                .   |dk     rt          d          i }t          d          D ]}||t          |g          <   d t          j        ||           D             }t          |          |k     r$t          j                    |D ]5}t          |d d         |dd                    D ]}|xx         dz  cc<   6t          fd          }	|	d         |	d         z   }
t          |          }|||
<   g }|D ]}g }d}|t          |          dz
  k     rh||         ||dz            f|	k    r|
                    |
           |d	z  }n |
                    ||                    |dz  }|t          |          dz
  k     h|t          |          dz
  k    r|
                    ||                    |
                    |           |}|rt          d
|	d          d|	d                     t          d|
 dt          |           d           |dv r1t          d           t          d |d d         D                        n1|dk    r+t          d           |d d         D ]}t          |           t          d           t          |          |k     $|S )N   z;vocab_size must be at least 256, so we can encode all bytesc                L    g | ]!}d  |                     d          D             "S )c                .    g | ]}t          |g          S r   rX   rY   s     r   r>   z(bpe_train.<locals>.<listcomp>.<listcomp>   s     222s222r   r"   )r$   )r   r)   s     r   r>   zbpe_train.<locals>.<listcomp>   s@          7;22T[[11222     r   r^   r_   c                    |          S r/   r   )xstatss    r   <lambda>zbpe_train.<locals>.<lambda>   s    E!H r   )keyr   r`   z The current most common pair is z + zSo we made z our zth tokenr[   z9Now the first fifty words in our training data look like:c                    g | ]	}|D ]}|
S r   r   )r   r)   r   s      r   r>   zbpe_train.<locals>.<listcomp>   s%    !Q!Q!QDD!Q!Q5%!Q!Q!Q!Qr   2   r]   z:Now the first twenty words in our training data look like:   
)
ValueErrorranger,   r   r#   lencollectionsCounterrf   maxappendrd   rc   )rD   rA   r   r   ranksrk   r'   piecerl   most_common_pairr   r   	new_wordsr)   new_wordrt   s                  @r   rF   rF   w   s,    DVWWWE4[[  eQCjj   ?D}WVZ?[?[     E
 e**z
!
!#%% 	! 	!EE#2#Jabb	22 ! !dq ! u*<*<*<*<===&q),<Q,??E

"k 	 	' 	'DHAc$ii!m##GT!a%[)-===OOK000FAAOODG,,,FA c$ii!m## CIIM!!Q(((X&&&&  
	b5Ea5HbbM]^_M`bbcccFFF#e**FFFGGG///QRRR !Q!QE#2#J!Q!Q!QRRRRh&&RSSS!#2#J    D$KKKK$KKKW e**z
!
!Z Lr   token_valuesr;   r   c                8   d dD             }d | D             }d}d }|D ]k}||t          |          z           }||k    r#||dz   t          |          z           }||k    sJ |}|t          |          z  }t          ||z   d           lt          d           d S )	Nc                    g | ]}d | d	S )z[48;5;mr   )r   rk   s     r   r>   z$visualise_tokens.<locals>.<listcomp>   s$    PPP!%%%%PPPr   )         M   P   D      c                <    g | ]}|                     d d          S )r"   r7   r8   )r:   )r   rs   s     r   r>   z$visualise_tokens.<locals>.<listcomp>   s(    VVVAAHHWYH??VVVr   r   r_    )endz[0m)r}   rd   )r   
backgroundunicode_token_valuesrunning_length
last_colorr   r\   s          r   rc   rc      s    PP/OPPPJ WVVVVNJ% % %>C
OO;<J 2c*ooEFEJ&&&&
#e**$eem$$$$$	+r   c                    d} t          t                    5 }|                                }d d d            n# 1 swxY w Y   t                              |d|           }t          d           |                    d          }|                    |          dk    sJ |                    |          dk    sJ |	                    |          ddgk    sJ |S )	NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+iX  )rA   r   zJThis is the sequence of merges performed in order to encode 'hello world':zhello worlds   hello worlds   hellos    world)
open__file__readr   rG   rd   r$   r:   r5   r?   )gpt2_patternfrD   encr(   s        r   train_simple_encodingr      s   ]  
h 1vvxx               !
&
&t\
&
R
RC	
VWWWZZ&&F::f....F##~5555""6**x.CCCCCJs   8<<rP   )r   r	   rU   r,   r   r   r
   r    )
rD   r   rA   rB   r   r   r   r   r
   r	   )r   r;   r
   r   )__doc__
__future__r   r~   r   rJ   r   r%   rF   rc   r   r   r   r   <module>r      s    P P " " " " " "      D
 D
 D
 D
 D
 D
 D
 D
P NV! ! ! ! !J GOB B B B BJ   (    r   