package types
v0.0.0-...-d69ec9c
Published: Aug 14, 2023 License: MIT Imports: 14 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type CrawlerOptions

type CrawlerOptions struct {
	// OutputWriter is the interface for writing output
	OutputWriter output.Writer
	// RateLimit is a mechanism for controlling the request rate
	RateLimit ratelimit.Limiter
	// Options contains the user specified configuration options
	Options *Options
	// ExtensionsValidator is a validator for file extensions
	ExtensionsValidator *extensions.Validator
	// UniqueFilter is a filter for deduplicating crawled items
	UniqueFilter filters.Filter
	// ScopeManager is a manager for validating crawling scope
	ScopeManager *scope.Manager
	// Dialer is the global dialer instance for the crawler
	Dialer *fastdialer.Dialer
	// Wappalyzer is the instance used for technology detection
	Wappalyzer *wappalyzer.Wappalyze
}

CrawlerOptions contains helper utilities for the crawler

func NewCrawlerOptions

func NewCrawlerOptions(options *Options) (*CrawlerOptions, error)

NewCrawlerOptions creates a new crawler options structure from user-specified options.

func (*CrawlerOptions) Close

func (c *CrawlerOptions) Close() error

Close closes the crawler options resources
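A minimal lifecycle sketch. The import path github.com/projectdiscovery/katana/pkg/types and the use of a plain string slice literal for goflags.StringSlice are assumptions not confirmed by this page; the target URL and numeric values are placeholders:

package main

import (
	"log"

	"github.com/projectdiscovery/goflags"
	"github.com/projectdiscovery/katana/pkg/types" // assumed import path
)

func main() {
	// Build user-specified options; the target URL is a placeholder.
	opts := &types.Options{
		URLs:        goflags.StringSlice{"https://example.com"},
		MaxDepth:    2,
		RateLimit:   100,
		Concurrency: 10,
	}

	// NewCrawlerOptions wires up the helper utilities (output writer,
	// rate limiter, scope manager, dialer, ...) from the options.
	crawlerOpts, err := types.NewCrawlerOptions(opts)
	if err != nil {
		log.Fatal(err)
	}
	// Close releases the resources held by the crawler options.
	defer crawlerOpts.Close()
}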

func (*CrawlerOptions) ValidatePath

func (c *CrawlerOptions) ValidatePath(path string) bool
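ValidatePath carries no doc comment here; given the ExtensionsValidator field above, it presumably checks a path against the configured extension match/filter lists. Continuing the sketch above (the path is illustrative):

if crawlerOpts.ValidatePath("/static/app.js") {
	// the path passes the configured extension rules (assumed semantics)
}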

func (*CrawlerOptions) ValidateScope

func (c *CrawlerOptions) ValidateScope(absURL, rootHostname string) (bool, error)

ValidateScope validates the crawling scope for an absolute URL
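Continuing the sketch above, with illustrative URL and hostname values:

inScope, err := crawlerOpts.ValidateScope("https://blog.example.com/post/1", "example.com")
if err != nil {
	log.Fatal(err)
}
if inScope {
	// the URL falls within the configured crawl scope
}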

type OnResultCallback

type OnResultCallback func(output.Result)

OnResultCallback is a callback function invoked for each crawl result (output.Result).
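A sketch of wiring the callback into Options, assuming output refers to katana's output package; the Printf format is deliberately generic so as not to assume the layout of output.Result:

opts := &types.Options{
	OnResult: func(result output.Result) {
		// invoked once per crawled result
		fmt.Printf("%+v\n", result)
	},
}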

type Options

type Options struct {
	// URLs contains a list of URLs for crawling
	URLs goflags.StringSlice
	// Scope contains a list of regexes for in-scope URLs
	Scope goflags.StringSlice
	// OutOfScope contains a list of regexes for out-of-scope URLs
	OutOfScope goflags.StringSlice
	// NoScope disables host based default scope
	NoScope bool
	// DisplayOutScope displays out of scope items in results
	DisplayOutScope bool
	// ExtensionsMatch contains extensions to match explicitly
	ExtensionsMatch goflags.StringSlice
	// ExtensionFilter contains additional extensions for the filter list
	ExtensionFilter goflags.StringSlice
	// OutputMatchCondition is the condition to match output
	OutputMatchCondition string
	// OutputFilterCondition is the condition to filter output
	OutputFilterCondition string
	// MaxDepth is the maximum depth to crawl
	MaxDepth int
	// BodyReadSize is the maximum size of response body to read
	BodyReadSize int
	// Timeout is the time to wait for a request in seconds
	Timeout int
	// CrawlDuration is the maximum duration to crawl the target for
	CrawlDuration time.Duration
	// Delay is the delay between crawl requests in seconds
	Delay int
	// RateLimit is the maximum number of requests to send per second
	RateLimit int
	// Retries is the number of retries to attempt for a request
	Retries int
	// RateLimitMinute is the maximum number of requests to send per minute
	RateLimitMinute int
	// Concurrency is the number of concurrent crawling goroutines
	Concurrency int
	// Parallelism is the number of URL-processing goroutines
	Parallelism int
	// FormConfig is the path to the form configuration file
	FormConfig string
	// Proxy is the URL for the proxy server
	Proxy string
	// Strategy is the crawling strategy: depth-first or breadth-first
	Strategy string
	// FieldScope is the scope field for default DNS scope
	FieldScope string
	// OutputFile is the file to write output to
	OutputFile string
	// KnownFiles enables crawling of known files like robots.txt, sitemap.xml, etc.
	KnownFiles string
	// Fields specifies the fields to format in the output
	Fields string
	// StoreFields specifies the fields to store in separate per-host files
	StoreFields string
	// FieldConfig is the path to the custom field configuration file
	FieldConfig string
	// NoColors disables coloring of response output
	NoColors bool
	// JSON enables writing output in JSON format
	JSON bool
	// Silent shows only the results output
	Silent bool
	// Verbose enables verbose output
	Verbose bool
	// Version enables showing the crawler version
	Version bool
	// ScrapeJSResponses enables scraping of relative endpoints from javascript
	ScrapeJSResponses bool
	// ScrapeJSLuiceResponses enables scraping of endpoints from javascript using jsluice
	ScrapeJSLuiceResponses bool
	// CustomHeaders is a list of custom headers to add to request
	CustomHeaders goflags.StringSlice
	// Headless enables headless scraping
	Headless bool
	// AutomaticFormFill enables optional automatic form filling and submission
	AutomaticFormFill bool
	// FormExtraction enables extraction of form, input, textarea & select elements
	FormExtraction bool
	// UseInstalledChrome skips the chrome install and uses the local instance
	UseInstalledChrome bool
	// ShowBrowser specifies whether to show the browser in headless mode
	ShowBrowser bool
	// HeadlessOptionalArguments specifies optional arguments to pass to Chrome
	HeadlessOptionalArguments goflags.StringSlice
	// HeadlessNoSandbox specifies if chrome should be started in --no-sandbox mode
	HeadlessNoSandbox bool
	// SystemChromePath specifies the chrome binary path for headless crawling
	SystemChromePath string
	// ChromeWSUrl specifies the Chrome debugger WebSocket URL of a running Chrome instance to attach to
	ChromeWSUrl string
	// OnResult is a callback function invoked on each result
	OnResult OnResultCallback
	// StoreResponse specifies if katana should store http requests/responses
	StoreResponse bool
	// StoreResponseDir specifies a custom directory for katana to store http requests/responses in
	StoreResponseDir string
	// OmitRaw omits raw requests/responses from the output
	OmitRaw bool
	// OmitBody omits the response body from the output
	OmitBody bool
	// ChromeDataDir specifies the --user-data-dir passed to the chrome binary to preserve sessions
	ChromeDataDir string
	// HeadlessNoIncognito specifies if chrome should be started without incognito mode
	HeadlessNoIncognito bool
	// XhrExtraction enables extraction of XHR requests
	XhrExtraction bool
	// HealthCheck determines if a self-healthcheck should be performed
	HealthCheck bool
	// ErrorLogFile specifies a file to which the errors of all requests are written
	ErrorLogFile string
	// Resolvers contains custom resolvers
	Resolvers goflags.StringSlice
	// OutputMatchRegex is the list of regexes to match output URLs
	OutputMatchRegex goflags.StringSlice
	// OutputFilterRegex is the list of regexes to filter output URLs
	OutputFilterRegex goflags.StringSlice
	// FilterRegex is the slice of compiled regexes to filter URLs
	FilterRegex []*regexp.Regexp
	// MatchRegex is the slice of compiled regexes to match URLs
	MatchRegex []*regexp.Regexp
	// DisableUpdateCheck disables the automatic update check
	DisableUpdateCheck bool
	// IgnoreQueryParams ignores crawling the same path with different query-param values
	IgnoreQueryParams bool
	// Debug enables debug output
	Debug bool
	// TlsImpersonate enables experimental TLS ClientHello randomization for the standard crawler
	TlsImpersonate bool
}
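A configuration sketch using field names from the struct above; the regex patterns and URL are illustrative only:

opts := &types.Options{
	URLs:       goflags.StringSlice{"https://example.com"},
	Scope:      goflags.StringSlice{`.*\.example\.com`}, // in-scope regex (illustrative)
	OutOfScope: goflags.StringSlice{`logout|signout`},   // out-of-scope regex (illustrative)
	MaxDepth:   3,
	Timeout:    10,            // seconds
	Strategy:   "depth-first", // or "breadth-first"
	Headless:   true,
}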

func (*Options) ParseCustomHeaders

func (options *Options) ParseCustomHeaders() map[string]string

func (*Options) ParseHeadlessOptionalArguments

func (options *Options) ParseHeadlessOptionalArguments() map[string]string
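Neither parser is documented on this page; judging from the field comments, ParseCustomHeaders likely splits "Name: value" entries and ParseHeadlessOptionalArguments likely splits Chrome-style arguments into key/value pairs, but the exact separators are assumptions. A sketch:

opts := &types.Options{
	CustomHeaders:             goflags.StringSlice{"User-Agent: my-crawler"},
	HeadlessOptionalArguments: goflags.StringSlice{"--window-size=1920,1080"},
}
headers := opts.ParseCustomHeaders()                // e.g. map["User-Agent"]="my-crawler" (format assumed)
chromeArgs := opts.ParseHeadlessOptionalArguments() // parsed chrome arguments (format assumed)
fmt.Println(headers, chromeArgs)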
