tokenizer

package module
v0.0.0-...-33467e6 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 10, 2023 | License: MIT | Imports: 8 | Imported by: 2

README

tokenizer

Go Reference

NLP tokenizers in Go.

Installation

$ go get -u github.com/go-aie/tokenizer

Documentation

Check out the documentation at https://pkg.go.dev/github.com/go-aie/tokenizer.

License

MIT

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func NewWordLevel

func NewWordLevel(vocab map[string]int, unkToken string) *wordlevel.WordLevel

NewWordLevel creates a WordLevel model from a given vocab.

Types

type RuneLevel

type RuneLevel struct {
	*wordlevel.WordLevel
	// contains filtered or unexported fields
}

RuneLevel is a model tokenizer that splits each word into runes and maps runes to IDs.

func NewRuneLevel

func NewRuneLevel(vocab RuneLevelVocab) *RuneLevel

func (*RuneLevel) Tokenize

func (rl *RuneLevel) Tokenize(token string) ([]tokenizer.Token, error)

Tokenize transforms the given input token into a list of rune-level sub-tokens.

type RuneLevelVocab

type RuneLevelVocab interface {
	Vocab() map[string]int
	UnkToken() string
	TokenToID(token string) (int, error)
}

type Tokenizer

type Tokenizer struct {
	*tokenizer.Tokenizer
}

func (*Tokenizer) EncodeBatchSerially

func (t *Tokenizer) EncodeBatchSerially(inputs []tokenizer.EncodeInput, addSpecialTokens bool) ([]tokenizer.Encoding, error)

EncodeBatchSerially encodes all sentences serially.

func (*Tokenizer) EncodeBatchTexts

func (t *Tokenizer) EncodeBatchTexts(texts []string, addSpecialTokens bool) ([]tokenizer.Encoding, error)

type Vocab

type Vocab[T constraints.Integer] struct {
	// contains filtered or unexported fields
}

func NewVocabFromFile

func NewVocabFromFile[T constraints.Integer](filename, separator, unkToken string) (*Vocab[T], error)

func NewVocabFromSlice

func NewVocabFromSlice[T constraints.Integer](lines []string, separator, unkToken string) (*Vocab[T], error)

func (*Vocab[T]) IDToToken

func (v *Vocab[T]) IDToToken(id T) (string, error)

func (*Vocab[T]) IDsToTokens

func (v *Vocab[T]) IDsToTokens(ids []T) (tokens []string, err error)

func (*Vocab[T]) TokenToID

func (v *Vocab[T]) TokenToID(token string) (T, error)

func (*Vocab[T]) TokensToIDs

func (v *Vocab[T]) TokensToIDs(tokens []string) (ids []T, err error)

func (*Vocab[T]) UnkToken

func (v *Vocab[T]) UnkToken() string

func (*Vocab[T]) Vocab

func (v *Vocab[T]) Vocab() map[string]T

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL