simhashUTF

package
v2.0.0-...-581a106 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 3, 2017 License: MIT Imports: 4 Imported by: 0

Documentation

Overview

simhashUTF -- simhash language-specific handling for UTF.

This package refactors the Unicode handling code of the original (v1) design into a thin language-handling layer, which showcases how easy it is to extend simhash's language-specific handling functionality.

This modular approach (v2 design) helps to reduce and limit the size of the core code, while making it easy to extend the core functionality as well.

Example (Output)

For a standalone test, change the package to `main` and the next func definition to `func main() {`.

// package main

package main

import (
	"fmt"

	"github.com/go-dedup/simhash"
	"github.com/go-dedup/simhash/simhashUTF"
	"golang.org/x/text/unicode/norm"
)

// for standalone test, change package to `main` and the next func def to,
// func main() {
func main() {
	hashes := make([]uint64, len(docs))
	sh := simhashUTF.NewUTFSimhash(norm.NFKC)
	for i, d := range docs {
		hashes[i] = sh.GetSimhash(sh.NewWordFeatureSet(d))
		fmt.Printf("Simhash of '%s': %x\n", d, hashes[i])
	}

	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[1], simhash.Compare(hashes[0], hashes[1]))
	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[2], simhash.Compare(hashes[0], hashes[2]))
	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[3], simhash.Compare(hashes[0], hashes[3]))

}

// docs holds the sample texts hashed by the example above: a longer French
// phrase, a shorter phrase sharing the word "après-midi" with it, and two
// single-word greetings. The example compares the first entry against each
// of the other three, so the slice must keep at least four entries.
var docs = [][]byte{
	[]byte("la fin d'un bel après-midi d'été"),
	[]byte("bonne après-midi"),
	[]byte("Bonjour"),
	[]byte("Bonsoir"),
}
Output:

Simhash of 'la fin d'un bel après-midi d'été': 58dbbd1fefab774a
Simhash of 'bonne après-midi': fadfbfbfdf8e7b7f
Simhash of 'Bonjour': ac5261af4fdd5252
Simhash of 'Bonsoir': fb42ceaf7cda4905
Comparison of `la fin d'un bel après-midi d'été` and `bonne après-midi`: 18
Comparison of `la fin d'un bel après-midi d'été` and `Bonjour`: 28
Comparison of `la fin d'un bel après-midi d'été` and `Bonsoir`: 34

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type SimhashUTF

// SimhashUTF is the type returned by NewSimhash and NewUTFSimhash. It embeds
// simhash.SimhashBase from the core simhash package.
type SimhashUTF struct {
	simhash.SimhashBase
	// contains filtered or unexported fields
}

func NewSimhash

func NewSimhash() *SimhashUTF

NewSimhash makes a new SimhashUTF.

func NewUTFSimhash

func NewUTFSimhash(_f norm.Form) *SimhashUTF

NewUTFSimhash makes a new SimhashUTF

func (*SimhashUTF) NewUnicodeWordFeatureSet

func (st *SimhashUTF) NewUnicodeWordFeatureSet(b []byte, f norm.Form) *UnicodeWordFeatureSet
Example (InChinese)
sh := NewSimhash()
text := []byte("当山峰没有棱角的时候")
fs := sh.NewUnicodeWordFeatureSet(text, norm.NFKC)
fmt.Printf("%#v\n", fs)
actual := fs.GetFeatures()
fmt.Printf("%#v\n", actual)
Output:

&simhashUTF.UnicodeWordFeatureSet{WordFeatureSet:simhash.WordFeatureSet{B:[]uint8{0xe5, 0xbd, 0x93, 0xe5, 0xb1, 0xb1, 0xe5, 0xb3, 0xb0, 0xe6, 0xb2, 0xa1, 0xe6, 0x9c, 0x89, 0xe6, 0xa3, 0xb1, 0xe8, 0xa7, 0x92, 0xe7, 0x9a, 0x84, 0xe6, 0x97, 0xb6, 0xe5, 0x80, 0x99}}, f:2}
[]simhash.Feature{simhash.feature{sum:0xa5edea16c0c7a180, weight:1}}
Example (InWestern)
sh := NewSimhash()
text := []byte("la fin d'un bel après-midi d'été")
fs := sh.NewUnicodeWordFeatureSet(text, norm.NFKC)
fmt.Printf("%#v\n", fs)
actual := fs.GetFeatures()
fmt.Printf("%#v\n", actual)
Output:

&simhashUTF.UnicodeWordFeatureSet{WordFeatureSet:simhash.WordFeatureSet{B:[]uint8{0x6c, 0x61, 0x20, 0x66, 0x69, 0x6e, 0x20, 0x64, 0x27, 0x75, 0x6e, 0x20, 0x62, 0x65, 0x6c, 0x20, 0x61, 0x70, 0x72, 0xc3, 0xa8, 0x73, 0x2d, 0x6d, 0x69, 0x64, 0x69, 0x20, 0x64, 0x27, 0xc3, 0xa9, 0x74, 0xc3, 0xa9}}, f:2}
[]simhash.Feature{simhash.feature{sum:0x8325c07b4eb2548, weight:1}, simhash.feature{sum:0xd8cbc5186ba13198, weight:1}, simhash.feature{sum:0x15cdbd7eed98cfab, weight:1}, simhash.feature{sum:0xd8d9a1186bad324a, weight:1}, simhash.feature{sum:0x3adb901f8c8a7b5e, weight:1}, simhash.feature{sum:0x7e8f29c36ffb774e, weight:1}}

func (*SimhashUTF) NewWordFeatureSet

func (st *SimhashUTF) NewWordFeatureSet(b []byte) *UnicodeWordFeatureSet

type UnicodeWordFeatureSet

type UnicodeWordFeatureSet struct {
	// Embedded word feature set from the core simhash package; in the example
	// output on this page its B field holds the UTF-8 bytes of the input text.
	simhash.WordFeatureSet
	// contains filtered or unexported fields
}

UnicodeWordFeatureSet is a feature set in which each word is a feature, all of equal weight.

See: http://blog.golang.org/normalization See: https://groups.google.com/forum/#!topic/golang-nuts/YyH1f_qCZVc

func (*UnicodeWordFeatureSet) GetFeatures

func (w *UnicodeWordFeatureSet) GetFeatures() []simhash.Feature

GetFeatures returns a []Feature representing each word in the byte slice.

func (*UnicodeWordFeatureSet) Normalize

func (w *UnicodeWordFeatureSet) Normalize()

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL