goAcq

package module
v0.0.0-...-1ab6450 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 13, 2014 License: Apache-2.0 Imports: 18 Imported by: 0

README

goAcquisition

goAcquisition是基于Go语言开发的网页内容采集库

Build Status

采集内容可存数据库,sqlite, mysql, pg

也可以存入文件,文章分文件夹 文章.txt resource文件夹

可以忽略某些html标签

Documentation

Overview

Copyright 2014 [email protected]. All rights reserved. Use of this source code is governed by an Apache License 2.0 that can be found in the LICENSE file.

Copyright 2014 [email protected]. All rights reserved. Use of this source code is governed by an Apache License 2.0 that can be found in the LICENSE file.

Copyright 2014 [email protected]. All rights reserved. Use of this source code is governed by an Apache License 2.0 that can be found in the LICENSE file.

Copyright 2014 [email protected]. All rights reserved. Use of this source code is governed by an Apache License 2.0 that can be found in the LICENSE file.

Index

Constants

This section is empty.

Variables

View Source
var (
	// Engine is the package-level xorm engine handle.
	// NOTE(review): where it is initialized is not visible in this view —
	// confirm it is set before any use.
	Engine *xorm.Engine
)
View Source
var UrlPattern = "" /* 152-byte string literal not displayed */

Functions

This section is empty.

Types

type AcqNode

type AcqNode struct {
	// contains filtered or unexported fields
}

func NewDefaultAcqNode

func NewDefaultAcqNode(nodeName string) *AcqNode

func (*AcqNode) AddListUrls

func (a *AcqNode) AddListUrls(urls ...string) (in []string)

添加采集列表

func (*AcqNode) AddListUrlsByTag

func (a *AcqNode) AddListUrlsByTag(urlMatch string, min, max, gap uint32) (in []string)

Example urlMatch value: http://www.aiwutech.com/test/list_(*).html

func (*AcqNode) Exec

func (a *AcqNode) Exec()

func (*AcqNode) GetMatchMode

func (a *AcqNode) GetMatchMode() EMatchMode

func (*AcqNode) GetNodeName

func (a *AcqNode) GetNodeName() string

func (*AcqNode) GetTargetEncode

func (a *AcqNode) GetTargetEncode() ETargetEncodeType

func (*AcqNode) GetTargetListUrls

func (a *AcqNode) GetTargetListUrls() []string

采集列表url

func (*AcqNode) GetTargetUrlBeginHtml

func (a *AcqNode) GetTargetUrlBeginHtml() string

func (*AcqNode) GetTargetUrlEndHtml

func (a *AcqNode) GetTargetUrlEndHtml() string

func (*AcqNode) Len

func (a *AcqNode) Len() int

采集目标列表个数

func (*AcqNode) SetMatchMode

func (a *AcqNode) SetMatchMode(mode EMatchMode)

func (*AcqNode) SetNodeName

func (a *AcqNode) SetNodeName(nodeName string)

func (*AcqNode) SetTargetEncode

func (a *AcqNode) SetTargetEncode(encode ETargetEncodeType)

func (*AcqNode) SetTargetUrlBeginHtml

func (a *AcqNode) SetTargetUrlBeginHtml(html string)

func (*AcqNode) SetTargetUrlEndHtml

func (a *AcqNode) SetTargetUrlEndHtml(html string)

type AcqTarget

// AcqTarget holds a target URL, two filter strings, and one *OriginRule per
// field (title, author, from, post time, content).
// NOTE(review): how the filters and rules are applied is not visible in this
// view — confirm against the implementation.
type AcqTarget struct {
	TargetUrl     string
	// KeyWrodFilter is spelled "Wrod" (sic) in the exported API; presumably a
	// keyword filter. Renaming it would break existing callers.
	KeyWrodFilter string
	SummaryFilter string
	TitleRule     *OriginRule
	AuthorRule    *OriginRule
	FromRule      *OriginRule
	PostTimeRule  *OriginRule
	ContentRule   *OriginRule
}

type EMatchMode

// EMatchMode identifies a match mode; it is backed by an int32.
type EMatchMode int32

// Match mode values, written out explicitly: Mode_Regex is 0 and
// Mode_String is 1.
const (
	Mode_Regex  EMatchMode = 0
	Mode_String EMatchMode = 1
)

type ETargetEncodeType

// ETargetEncodeType names a target encoding by its string label.
type ETargetEncodeType string

// Encoding labels available as ETargetEncodeType values.
const (
	EncodeType_GB2312 = ETargetEncodeType("gb2312")
	EncodeType_UTF8   = ETargetEncodeType("utf8")
	EncodeType_BIG5   = ETargetEncodeType("big5")
)

type OriginRule

// OriginRule pairs a Match string with a Filter string. AcqTarget holds one
// *OriginRule per field (title, author, from, post time, content).
// NOTE(review): presumably Match selects the raw text and Filter cleans it,
// interpreted per EMatchMode — confirm against the implementation.
type OriginRule struct {
	Match  string
	Filter string
}

type TblAcqNode

// TblAcqNode is the xorm-mapped record for an acquisition node.
//
// NodeName carries a unique constraint. Created and LastAcq carry xorm's
// "created" and "updated" tags, matching TblTarget and TblTargetList.
type TblAcqNode struct {
	Id         int64
	NodeName   string `xorm:"unique"`
	NodeEncode string
	MatchMode  int32
	//	targetUrlMatch `xorm:"extends"`
	AcqCnt  int32
	Created time.Time `xorm:"created"`
	// The tag key was previously misspelled "xomr", so xorm never saw the
	// "updated" directive on this column.
	LastAcq time.Time `xorm:"updated"`
}

Persistent storage record for AcqNodes.

type TblTarget

// TblTarget is the xorm-mapped record for a single target. It stores the
// target URL, several string fields (keyword, summary, title, from, post
// time, content), a save path, and an int32 counter.
// NOTE(review): NodeId and ListId presumably reference TblAcqNode.Id and
// TblTargetList.Id — no foreign-key tag is declared here, so confirm.
type TblTarget struct {
	Id        int64
	NodeId    int64
	ListId    int64
	TargetUrl string
	Keyword   string
	Summary   string
	Title     string
	From      string
	// PostTime is kept as a string, not a time.Time.
	PostTime  string
	Content   string
	SavePath  string
	AcqCnt    int32
	// Created and LastAcq carry xorm's "created" and "updated" tags.
	Created   time.Time `xorm:"created"`
	LastAcq   time.Time `xorm:"updated"`
}

Persistent storage record for targets.

type TblTargetList

// TblTargetList is the xorm-mapped record for one list URL.
// NOTE(review): NodeId presumably references TblAcqNode.Id — no foreign-key
// tag is declared here, so confirm.
type TblTargetList struct {
	Id      int64
	NodeId  int64
	ListUrl string
	AcqCnt  int32
	// Created and LastAcq carry xorm's "created" and "updated" tags.
	Created time.Time `xorm:"created"`
	LastAcq time.Time `xorm:"updated"`
}

Persistent storage record for lists.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL