nlp

package
v0.0.0-...-3ddf8cf Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 5, 2024 License: Apache-2.0 Imports: 12 Imported by: 0

Documentation

Index

Constants

View Source
// Placeholder tokens that stand in for longer sequences treated as
// semantically identical for analysis.
const (
	// NumToken is the placeholder for a number. PercentRegex and DateRegex are
	// built from it, so they appear to expect numbers to have been replaced
	// first — TODO confirm against the normalization code.
	NumToken        = "_num_"
	// PercentToken is the placeholder for a percentage.
	PercentToken    = "_percent_"
	// DateToken is the placeholder for a calendar date.
	DateToken       = "_date_"
	// MathToken is the placeholder for mathematical markup (presumably the
	// text matched by MathRegex — verify against the caller).
	MathToken       = "_math_"
	// HieroglyphToken is the placeholder for hieroglyph markup (presumably the
	// text matched by HieroglyphRegex — verify against the caller).
	HieroglyphToken = "_hieroglyph_" //nolint:gosec // This is a reference to egyptian hieroglyphs.
)

Tokens to replace longer sequences with, that we treat as semantically identical for analysis.

View Source
const (
	// Months is a parenthesized regular-expression alternation of the twelve
	// English month names, spelled in lowercase. It is a pattern fragment, not
	// a compiled regexp: DateRegex interpolates it and supplies the
	// case-insensitive flag itself.
	Months = "(january|february|march|april|may|june|july|august|september|october|november|december)"
)

Variables

View Source
var (

	// CommentRegex matches commented-out text. Such text is not shown on pages
	// and is generally either off-topic or low quality.
	//
	// Obviously not perfect and can match non-comments in rare cases.
	CommentRegex = regexp.MustCompile("(?s)<!--.*?-->")

	// IgnoredTagsRegex matches, case-insensitively, a single opening or closing
	// tag whose name starts with one of the names returned by ignoredTags().
	// Only the tag itself is matched, not the content between a pair, and
	// because the "s" flag is absent the tag must not span a newline.
	IgnoredTagsRegex     = regexp.MustCompile(fmt.Sprintf(`(?i)</?(%s).*?>`, strings.Join(ignoredTags(), "|")))
	// TimelineRegex and the paired-tag regexes that follow (through TableRegex)
	// each match a whole element: from the opening tag through the first
	// corresponding closing tag, case-insensitively and across newlines. The
	// trailing [\w\s]* tolerates extra word characters or whitespace inside the
	// closing tag. The opening side is a prefix match, so e.g. `<del` also
	// matches any tag whose name merely begins with "del".
	TimelineRegex        = regexp.MustCompile(`(?is)<timeline.*?</timeline[\w\s]*>`)
	GalleryRegex         = regexp.MustCompile(`(?is)<gallery.*?</gallery[\w\s]*>`)
	GraphRegex           = regexp.MustCompile(`(?is)<graph.*?</graph[\w\s]*>`)
	ImageMapRegex        = regexp.MustCompile(`(?is)<imagemap.*?</imagemap[\w\s]*>`)
	MathRegex            = regexp.MustCompile(`(?is)<math.*?</math[\w\s]*>`)
	CodeRegex            = regexp.MustCompile(`(?is)<code.*?</code[\w\s]*>`)
	CiteRegex            = regexp.MustCompile(`(?is)<cite.*?</cite[\w\s]*>`)
	ChemRegex            = regexp.MustCompile(`(?is)<chem.*?</chem[\w\s]*>`)
	PoemRegex            = regexp.MustCompile(`(?is)<poem.*?</poem[\w\s]*>`)
	// HieroglyphRegex matches <hiero>...</hiero> elements.
	HieroglyphRegex      = regexp.MustCompile(`(?is)<hiero.*?</hiero[\w\s]*>`)
	MapframeRegex        = regexp.MustCompile(`(?is)<mapframe.*?</mapframe[\w\s]*>`)
	DelRegex             = regexp.MustCompile(`(?is)<del.*?</del[\w\s]*>`)
	SyntaxHighlightRegex = regexp.MustCompile(`(?is)<syntaxhighlight.*?</syntaxhighlight[\w\s]*>`)
	PreRegex             = regexp.MustCompile(`(?is)<pre.*?</pre[\w\s]*>`)
	TableRegex           = regexp.MustCompile(`(?is)<table.*?</table[\w\s]*>`)
	// TableRegex2 matches wiki markup that opens with `{|` or `{{` and runs,
	// across newlines, to the first line beginning with `|}`.
	TableRegex2          = regexp.MustCompile(`(?s)({\||{{).*?\n\|}`)
	// BrRegex matches, case-insensitively, a single tag starting with `<p`,
	// `<br` or `<hr`, up to the next `>` on the same line.
	//
	// NOTE(review): this is a prefix match, so it also matches tags such as
	// `<pre>` or `<poem>`; confirm it is applied after the paired-tag regexes.
	BrRegex              = regexp.MustCompile(`(?i)<(p|br|hr).*?>`)

	// AlteredQuote matches a single ASCII letter in square brackets, e.g.
	// "[T]", and captures the letter. Presumably this targets editorially
	// altered letters inside quotations — TODO confirm against the caller.
	AlteredQuote = regexp.MustCompile(`\[([a-zA-Z])]`)

	// RemoveLinks matches wiki links whose target starts with "Category:"
	// (optionally preceded by a colon) or with "List of".
	RemoveLinks = regexp.MustCompile(`\[\[(:?Category:|List of)[^]]+]]`)

	// WikipediaLinks matches a wiki link of the form [[text]] or
	// [[target|text]]. Group 1 is the optional "target|" prefix and group 2 is
	// the text after it.
	WikipediaLinks = regexp.MustCompile(`\[\[([^[\]]+\|)?([^[|]+?)]]`)

	// RefRegex matches a reference, either paired (<ref ...>...</ref>) or
	// self-closing (<ref ... />), across newlines. Unlike most regexes above it
	// is case-sensitive. The `<ref` prefix also matches tags such as
	// `<references />`.
	RefRegex = regexp.MustCompile(`(?s)<ref.*?(>.*?</ref>| ?/>)`)
)

Regular expressions for cleaning Wikipedia articles of XML tags and formatting.

View Source
var (
	// WordRegex matches a run of one or more word characters (ASCII letters,
	// digits, underscore) and apostrophes.
	WordRegex = regexp.MustCompile(`[\w']+`)

	// NumberRegex matches a number delimited by word boundaries: one or more
	// digits, optional comma-separated groups of three digits, and an optional
	// decimal fraction (e.g. "1,234.5").
	NumberRegex  = regexp.MustCompile(`\b\d+(,\d{3})*(\.\d+)?\b`)
	// PercentRegex matches NumToken immediately followed by a percent sign.
	PercentRegex = regexp.MustCompile(fmt.Sprintf(`%s%%`, NumToken))
	// DateRegex matches, case-insensitively, dates written in terms of
	// NumToken: either "<num> [<month>[,] ]<num>" or "<month> <num>[,] <num>".
	//
	// NOTE(review): the month in the first alternative is optional, so two
	// NumTokens separated by a single space also match; confirm this is
	// intended.
	DateRegex    = regexp.MustCompile(fmt.Sprintf(`(?i)\b(%s (%s,? )?%s|%s %s,? %s)\b`,
		NumToken, Months, NumToken,
		Months, NumToken, NumToken,
	))
)

Regular expressions for detecting semantically-similar sequences.

View Source
var File_pkg_nlp_nlp_proto protoreflect.FileDescriptor
View Source
// XMLTagRegex matches the start of an opening tag: `<` followed by a lowercase
// letter and at least one more lowercase letter or digit. Closing tags,
// single-character tag names and uppercase tag names are not matched.
var XMLTagRegex = regexp.MustCompile(`<[a-z][a-z0-9]+`)

XMLTagRegex tries to find XML tags which are still present in the corpus. Useful for finding problematic tags that we want to avoid.

Functions

func CleanArticle

func CleanArticle(text string) string

CleanArticle removes all parts of Wikipedia we never want to analyze.

func Normalize

func Normalize(w string) string

func NormalizeArticle

func NormalizeArticle(text string) string

func ToNgramDictionary

func ToNgramDictionary(dictionary *Dictionary) map[string]bool

Types

type Counter

type Counter struct {
	Tokenizer
}

func (Counter) Count

func (c Counter) Count(s string) map[string]int

type Dictionary

type Dictionary struct {

	// Words is a list of recognized words, in the order they appear in a frequency table.
	Words []string `protobuf:"bytes,1,rep,name=words,proto3" json:"words,omitempty"`
	// contains filtered or unexported fields
}

Dictionary is a set of known words.

func ReadDictionary

func ReadDictionary(path string) (*Dictionary, error)

ReadDictionary reads a Dictionary proto from a file. Returns an empty dictionary if path is the empty string.

func (*Dictionary) Descriptor deprecated

func (*Dictionary) Descriptor() ([]byte, []int)

Deprecated: Use Dictionary.ProtoReflect.Descriptor instead.

func (*Dictionary) GetWords

func (x *Dictionary) GetWords() []string

func (*Dictionary) ProtoMessage

func (*Dictionary) ProtoMessage()

func (*Dictionary) ProtoReflect

func (x *Dictionary) ProtoReflect() protoreflect.Message

func (*Dictionary) Reset

func (x *Dictionary) Reset()

func (*Dictionary) String

func (x *Dictionary) String() string

type FrequencyMap

type FrequencyMap struct {
	Words map[string]uint32 `` /* 152-byte string literal not displayed */
	// contains filtered or unexported fields
}

FrequencyMap is a set of known words and their frequencies.

func (*FrequencyMap) Descriptor deprecated

func (*FrequencyMap) Descriptor() ([]byte, []int)

Deprecated: Use FrequencyMap.ProtoReflect.Descriptor instead.

func (*FrequencyMap) GetWords

func (x *FrequencyMap) GetWords() map[string]uint32

func (*FrequencyMap) ProtoMessage

func (*FrequencyMap) ProtoMessage()

func (*FrequencyMap) ProtoReflect

func (x *FrequencyMap) ProtoReflect() protoreflect.Message

func (*FrequencyMap) Reset

func (x *FrequencyMap) Reset()

func (*FrequencyMap) String

func (x *FrequencyMap) String() string

type FrequencyTable

type FrequencyTable struct {
	Words []*WordCount `protobuf:"bytes,1,rep,name=words,proto3" json:"words,omitempty"`
	// contains filtered or unexported fields
}

FrequencyTable is a list of known words and their frequencies.

func ToFrequencyTable

func ToFrequencyTable(m *FrequencyMap) *FrequencyTable

func (*FrequencyTable) Descriptor deprecated

func (*FrequencyTable) Descriptor() ([]byte, []int)

Deprecated: Use FrequencyTable.ProtoReflect.Descriptor instead.

func (*FrequencyTable) GetWords

func (x *FrequencyTable) GetWords() []*WordCount

func (*FrequencyTable) ProtoMessage

func (*FrequencyTable) ProtoMessage()

func (*FrequencyTable) ProtoReflect

func (x *FrequencyTable) ProtoReflect() protoreflect.Message

func (*FrequencyTable) Reset

func (x *FrequencyTable) Reset()

func (*FrequencyTable) Sort

func (x *FrequencyTable) Sort()

func (*FrequencyTable) String

func (x *FrequencyTable) String() string

type NgramTokenizer

type NgramTokenizer struct {
	Underlying WordTokenizer

	Dictionary map[string]bool
}

func (NgramTokenizer) Tokenize

func (t NgramTokenizer) Tokenize(s string) []string

type Tokenizer

type Tokenizer interface {
	// Tokenize splits s into distinct tokens.
	Tokenize(s string) []string
}

type WordCount

type WordCount struct {
	Word  string `protobuf:"bytes,1,opt,name=word,proto3" json:"word,omitempty"`
	Count uint32 `protobuf:"varint,2,opt,name=count,proto3" json:"count,omitempty"`
	// contains filtered or unexported fields
}

func (*WordCount) Descriptor deprecated

func (*WordCount) Descriptor() ([]byte, []int)

Deprecated: Use WordCount.ProtoReflect.Descriptor instead.

func (*WordCount) GetCount

func (x *WordCount) GetCount() uint32

func (*WordCount) GetWord

func (x *WordCount) GetWord() string

func (*WordCount) ProtoMessage

func (*WordCount) ProtoMessage()

func (*WordCount) ProtoReflect

func (x *WordCount) ProtoReflect() protoreflect.Message

func (*WordCount) Reset

func (x *WordCount) Reset()

func (*WordCount) String

func (x *WordCount) String() string

type WordTokenizer

type WordTokenizer struct{}

func (WordTokenizer) Tokenize

func (t WordTokenizer) Tokenize(s string) []string

type XMLTokenizer

type XMLTokenizer struct{}

func (XMLTokenizer) Tokenize

func (x XMLTokenizer) Tokenize(s string) []string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL