SMILA/Documentation/Web Crawler

XML Index Order

The following is an example of a Web Crawler index order:

<?xml version="1.0" encoding="UTF-8"?>
<IndexOrderConfiguration xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <DataSourceID>Web_TEST</DataSourceID>
  <SchemaID>org.eclipse.eilf.connectivity.framework.crawler.web</SchemaID>
  <DataConnectionID>
    <Crawler>MyWebCrawler</Crawler>
  </DataConnectionID>
  <CompoundHandling>No</CompoundHandling>
  <Attributes>
    <Attribute Type="String" Name="Url" KeyAttribute="true">
      <FieldAttribute>Url</FieldAttribute>
    </Attribute>
    <Attribute Type="String" Name="Title">
      <FieldAttribute>Title</FieldAttribute>
    </Attribute>
    <Attribute Type="String" Name="Content" HashAttribute="true" Attachment="true" MimeTypeAttribute="Content">
      <FieldAttribute>Content</FieldAttribute>
    </Attribute>
    <Attribute Type="String" Name="MetaData" Attachment="false">
      <MetaAttribute Type="MetaData"/>
    </Attribute>
    <Attribute Type="String" Name="ResponseHeader" Attachment="false">
      <MetaAttribute Type="ResponseHeader">
        <MetaName>Date</MetaName>
        <MetaName>Server</MetaName>
      </MetaAttribute>
    </Attribute>
    <Attribute Type="String" Name="MetaDataWithResponseHeaderFallBack" Attachment="false">
      <MetaAttribute Type="MetaDataWithResponseHeaderFallBack"/>
    </Attribute>
  </Attributes>
  <Process>
    <WebSite ProjectName="Example Crawler Configuration" Header="Accept-Encoding: gzip,deflate; Via: myProxy" Referer="http://myReferer">
      <UserAgent Name="Crawler" Version="1.0" Description="Test crawler" Url="http://www.softaria.com" Email="crawler@example.com"/>
      <CrawlingModel Type="MaxIterations" Value="20"/>
      <CrawlScope Type="Broad">
        <Filters>
          <Filter Type="BeginningPath" WorkType="Select" Value="/test.html"/>
        </Filters>
      </CrawlScope>
      <CrawlLimits>
        <SizeLimits MaxBytesDownload="0" MaxDocumentDownload="10" MaxTimeSec="3600" MaxLengthBytes="1000000" />
        <TimeoutLimits Timeout="10000" />
        <WaitLimits Wait="0" RandomWait="false" MaxRetries="8" WaitRetry="0"/>
      </CrawlLimits>
      <Seeds FollowLinks="NoFollow">
        <Seed>http://www.brox.de</Seed>
      </Seeds>
      <Filters>
        <Filter Type="BeginningPath" WorkType="Unselect" Value="/something/">
          <Refinements>
            <TimeOfDay From="09:00:00" To="23:00:00"/>
            <Port Number="80"/>
          </Refinements>
        </Filter>
        <Filter Type="RegExp" WorkType="Unselect" Value="news"/>
        <Filter Type="ContentType" WorkType="Unselect" Value="image/jpeg"/>
      </Filters>
      <MetaTagFilters>
        <MetaTagFilter Type="Name" Name="author" Content="Blocked Author" WorkType="Unselect"/>
      </MetaTagFilters>
    </WebSite>
  </Process>
</IndexOrderConfiguration>

XSD Schema used for Web Crawler

<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Liquid XML Studio 1.0.8.0 (http://www.liquid-technologies.com) -->
<xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:redefine schemaLocation="RootIndexOrderConfiguration.xsd">
    <xs:complexType name="Attribute">
      <xs:annotation>
        <xs:documentation>Attribute Specification</xs:documentation>
      </xs:annotation>
      <xs:complexContent mixed="false">
        <xs:extension base="Attribute">
          <xs:choice>
            <xs:element name="FieldAttribute" type="FieldAttributeType" />
            <xs:element name="MetaAttribute" type="MetaAttributeType" />
          </xs:choice>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>
    <xs:complexType name="Process">
      <xs:annotation>
        <xs:documentation>Process Specification</xs:documentation>
      </xs:annotation>
      <xs:complexContent mixed="false">
        <xs:extension base="Process">
          <xs:sequence>
            <xs:element minOccurs="0" maxOccurs="unbounded" name="WebSite" type="WebSite" />
          </xs:sequence>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>
  </xs:redefine>
  <xs:simpleType name="CrawlScope">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Broad" />
      <xs:enumeration value="Domain" />
      <xs:enumeration value="Host" />
      <xs:enumeration value="Path" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="FollowLinksType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Follow" />
      <xs:enumeration value="NoFollow" />
      <xs:enumeration value="FollowLinksWithCorrespondingSelectFilter" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="FilterType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="BeginningPath" />
      <xs:enumeration value="RegExp" />
      <xs:enumeration value="ContentType" />
      <xs:enumeration value="CrawlScope" />
      <xs:enumeration value="HtmlMetaTag" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="FilterWorkType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Select" />
      <xs:enumeration value="Unselect" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="ModelType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="MaxIterations" />
      <xs:enumeration value="MaxDepth" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="FieldAttributeType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Url" />
      <xs:enumeration value="Title" />
      <xs:enumeration value="Content" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="MetaType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="MetaData" />
      <xs:enumeration value="ResponseHeader" />
      <xs:enumeration value="MetaDataWithResponseHeaderFallBack" />
    </xs:restriction>
  </xs:simpleType>
  <xs:complexType name="MetaAttributeType">
    <xs:sequence>
      <xs:element name="MetaName" type="xs:string" minOccurs="0" maxOccurs="unbounded"/>      
    </xs:sequence>
    <xs:attribute name="Type" type="MetaType" use="required" />
    <!-- xs:attribute name="MetaName" type="xs:string" use="optional" / -->
  </xs:complexType>
  <xs:simpleType name="Robotstxt">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Classic" />
      <xs:enumeration value="Ignore" />
      <xs:enumeration value="Custom" />
      <xs:enumeration value="Set" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="HttpMethod">
    <xs:restriction base="xs:string">
      <xs:enumeration value="GET" />
      <xs:enumeration value="POST" />
    </xs:restriction>
  </xs:simpleType>
  <xs:simpleType name="HtmlMetaTagType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Name" />
      <xs:enumeration value="HttpEquiv" />
    </xs:restriction>
  </xs:simpleType>
  <xs:complexType name="WebSite">
    <xs:sequence>
      <xs:element minOccurs="0" name="UserAgent">
        <xs:complexType>
          <xs:attribute name="Name" type="xs:string" use="required" />
          <xs:attribute name="Version" type="xs:string" use="optional" />
          <xs:attribute name="Description" type="xs:string" use="optional" />
          <xs:attribute name="Url" type="xs:string" use="optional" />
          <xs:attribute name="Email" type="xs:string" use="optional" />
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="Robotstxt">
        <xs:complexType>
          <xs:attribute default="Classic" name="Policy" type="Robotstxt" use="optional" />
          <xs:attribute default="" name="Value" type="xs:string" use="optional" />
          <xs:attribute default="" name="AgentNames" type="xs:string" use="optional" />
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="CrawlingModel">
        <xs:complexType>
          <xs:attribute name="Type" type="ModelType" use="required" />
          <xs:attribute name="Value" type="xs:positiveInteger" use="required" />
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="CrawlScope">
        <xs:complexType>
          <xs:sequence>
            <xs:element minOccurs="0" name="Filters">
              <xs:complexType>
                <xs:sequence>
                  <xs:element maxOccurs="unbounded" name="Filter">
                    <xs:complexType>
                      <xs:complexContent mixed="false">
                        <xs:extension base="Filter" />
                      </xs:complexContent>
                    </xs:complexType>
                  </xs:element>
                </xs:sequence>
              </xs:complexType>
            </xs:element>
          </xs:sequence>
          <xs:attribute default="Host" name="Type" type="CrawlScope" use="optional" />
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="CrawlLimits">
        <xs:complexType>
          <xs:sequence>
            <xs:element minOccurs="0" name="SizeLimits">
              <xs:complexType>
                <xs:attribute default="0" name="MaxBytesDownload" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="MaxDocumentDownload" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="MaxTimeSec" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="MaxLengthBytes" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="LimitRate" type="xs:integer" use="optional" />
              </xs:complexType>
            </xs:element>
            <xs:element minOccurs="0" name="TimeoutLimits">
              <xs:complexType>
                <xs:attribute default="0" name="Timeout" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="DnsTimeout" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="ConnectTimeout" type="xs:integer" use="optional" />
                <xs:attribute default="900" name="ReadTimeout" type="xs:integer" use="optional" />
              </xs:complexType>
            </xs:element>
            <xs:element minOccurs="0" name="WaitLimits">
              <xs:complexType>
                <xs:attribute default="0" name="Wait" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="RandomWait" type="xs:boolean" use="optional" />
                <xs:attribute default="0" name="WaitRetry" type="xs:integer" use="optional" />
                <xs:attribute default="0" name="MaxRetries" type="xs:integer" use="optional" />
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="Proxy">
        <xs:complexType>
          <xs:choice>
            <xs:element name="ProxyServer">
              <xs:complexType>
                <xs:attribute name="Host" type="xs:string" use="required" />
                <xs:attribute name="Port" type="xs:string" use="required" />
                <xs:attribute default="" name="Login" type="xs:string" use="optional" />
                <xs:attribute default="" name="Password" type="xs:string" use="optional" />
              </xs:complexType>
            </xs:element>
            <xs:element name="AutomaticConfiguration">
              <xs:complexType>
                <xs:attribute name="Address" type="xs:string" use="required" />
              </xs:complexType>
            </xs:element>
          </xs:choice>
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="Authentication">
        <xs:complexType>
          <xs:sequence>
            <xs:element minOccurs="0" maxOccurs="unbounded" name="Rfc2617">
              <xs:complexType>
                <xs:attribute name="Host" type="xs:string" use="required" />
                <xs:attribute name="Port" type="xs:string" use="required" />
                <xs:attribute name="Realm" type="xs:string" use="required" />
                <xs:attribute name="Login" type="xs:string" use="required" />
                <xs:attribute name="Password" type="xs:string" use="required" />
              </xs:complexType>
            </xs:element>
            <xs:element minOccurs="0" maxOccurs="unbounded" name="HtmlForm">
              <xs:complexType>
                <xs:sequence>
                  <xs:element name="FormElements">
                    <xs:complexType>
                      <xs:sequence>
                        <xs:element maxOccurs="unbounded" name="FormElement">
                          <xs:complexType>
                            <xs:attribute name="Key" type="xs:string" use="required" />
                            <xs:attribute name="Value" type="xs:string" use="required" />
                          </xs:complexType>
                        </xs:element>
                      </xs:sequence>
                    </xs:complexType>
                  </xs:element>
                </xs:sequence>
                <xs:attribute name="CredentialDomain" type="xs:string" use="required" />
                <xs:attribute name="LoginUri" type="xs:string" use="required" />
                <xs:attribute name="HttpMethod" type="HttpMethod" use="required" />
              </xs:complexType>
            </xs:element>
            <xs:element minOccurs="0" maxOccurs="unbounded" name="SslCertificate">
              <xs:complexType>
                <xs:attribute name="ProtocolName" type="xs:string" use="required" />
                <xs:attribute name="Port" type="xs:string" use="required" />
                <xs:attribute name="TruststoreUrl" type="xs:string" use="required" />
                <xs:attribute default="" name="TruststorePassword" type="xs:string" use="optional" />
                <xs:attribute name="KeystoreUrl" type="xs:string" use="required" />
                <xs:attribute default="" name="KeystorePassword" type="xs:string" use="optional" />
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="Ssl">
        <xs:complexType>
          <xs:attribute name="TruststoreUrl" type="xs:string" use="required" />
          <xs:attribute default="" name="TruststorePassword" type="xs:string" use="optional" />
        </xs:complexType>
      </xs:element>
      <xs:element name="Seeds">
        <xs:complexType>
          <xs:sequence>
            <xs:element maxOccurs="unbounded" name="Seed" type="xs:string" />
          </xs:sequence>
          <xs:attribute default="Follow" name="FollowLinks" type="FollowLinksType" use="optional" />
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="Filters">
        <xs:complexType>
          <xs:sequence>
            <xs:element maxOccurs="unbounded" name="Filter">
              <xs:complexType>
                <xs:complexContent mixed="false">
                  <xs:extension base="Filter">
                    <xs:sequence>
                      <xs:element minOccurs="0" name="Refinements">
                        <xs:complexType>
                          <xs:sequence>
                            <xs:element minOccurs="0" name="TimeOfDay">
                              <xs:complexType>
                                <xs:attribute name="From" type="xs:time" use="required" />
                                <xs:attribute name="To" type="xs:time" use="required" />
                              </xs:complexType>
                            </xs:element>
                            <xs:element minOccurs="0" name="Port">
                              <xs:complexType>
                                <xs:attribute name="Number" type="xs:integer" use="required" />
                              </xs:complexType>
                            </xs:element>
                          </xs:sequence>
                        </xs:complexType>
                      </xs:element>
                    </xs:sequence>
                  </xs:extension>
                </xs:complexContent>
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:complexType>
      </xs:element>
      <xs:element minOccurs="0" name="MetaTagFilters">
        <xs:complexType>
          <xs:sequence>
            <xs:element maxOccurs="unbounded" name="MetaTagFilter">
              <xs:complexType>
                <xs:attribute name="Type" type="HtmlMetaTagType" use="required" />
                <xs:attribute name="Name" type="xs:string" use="required" />
                <xs:attribute name="Content" type="xs:string" use="required" />
                <xs:attribute name="WorkType" type="FilterWorkType" use="required" />
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:complexType>
      </xs:element>
    </xs:sequence>
    <xs:attribute name="ProjectName" type="xs:string" use="required" />
    <xs:attribute default="false" name="Sitemaps" type="xs:boolean" use="optional" />
    <xs:attribute default="" name="Header" type="xs:string" use="optional" />
    <xs:attribute default="" name="Referer" type="xs:string" use="optional" />
    <xs:attribute default="true" name="EnableCookies" type="xs:boolean" use="optional" />
  </xs:complexType>
  <xs:complexType name="Filter">
    <xs:attribute name="WorkType" type="FilterWorkType" use="required" />
    <xs:attribute name="Value" type="xs:string" use="required" />
    <xs:attribute name="Type" type="FilterType" use="required" />
  </xs:complexType>
</xs:schema>

Attribute element

Field Attributes. The FieldAttribute element describes the web page information that should be included in the index. The following options exist:

  1. Url: the URL of the web page.
  2. Title: the title of the web page.
  3. Content: the content of the web page, emitted as byte[].
<Attribute Type="String" Name="Url" KeyAttribute="true">
  <FieldAttribute>Url</FieldAttribute>
</Attribute>
<Attribute Type="String" Name="Title">
  <FieldAttribute>Title</FieldAttribute>
</Attribute>
<Attribute Type="String" Name="Content" HashAttribute="true" Attachment="true" MimeTypeAttribute="Content">
  <FieldAttribute>Content</FieldAttribute>
</Attribute>

Metadata Attributes. The MetaAttribute element describes meta information that should be included in the index, such as HTML metadata and HTTP response headers.

  1. MetaData
    The MetaData element describes the meta-tag information that should be included in the index. Well known meta-tags are for example:
    description
    keywords
    ...
  2. ResponseHeader
    The ResponseHeader element describes the response header information that should be included in the index. Well known response headers are for example:
    accept-ranges
    server
    location
    ...
  3. MetaDataWithResponseHeaderFallBack
    The MetaDataWithResponseHeaderFallBack element describes meta-tag or response header information that should be included in the index.
<Attribute Type="String" Name="MetaData" Attachment="false">
  <MetaAttribute Type="MetaData"/>
</Attribute>
<Attribute Type="String" Name="ResponseHeader" Attachment="false">
  <MetaAttribute Type="ResponseHeader">
    <MetaName>Date</MetaName>
    <MetaName>Server</MetaName>
  </MetaAttribute>
</Attribute>
<Attribute Type="String" Name="MetaDataWithResponseHeaderFallBack" Attachment="false">
  <MetaAttribute Type="MetaDataWithResponseHeaderFallBack"/>
</Attribute>

Process element

The Process element is responsible for selecting data. The schema definition of the Process element and its subelements is shown in the following figure:

[Figure: WebCrawler-Process.gif, schema of the Process element and its subelements]

Crawling configurations are defined separately for each website to be crawled. Websites are crawled in the order of their WebSite elements. Only the Seeds element is required in a WebSite configuration to start crawling.
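
For instance, a minimal sketch of a Process element with two websites, crawled in the order listed (project names and seed URLs are illustrative):

<Process>
  <WebSite ProjectName="Crawled First">
    <Seeds>
      <Seed>http://first.example.com/</Seed>
    </Seeds>
  </WebSite>
  <WebSite ProjectName="Crawled Second">
    <Seeds>
      <Seed>http://second.example.com/</Seed>
    </Seeds>
  </WebSite>
</Process>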

WebSite

The WebSite element contains all important information for accessing and crawling a web site. The available attributes are:

  • ProjectName: defines the project name.
  • Sitemaps: enables support for Google sitemaps. The sitemap.xml, sitemap.xml.gz and sitemap.gz formats are supported. Links extracted from <loc> tags are added to the current level's links. The crawler looks for the sitemap file in the root directory of the web server and then caches it for the particular host.
  • Header: request headers in the format "<header_name>:<header_content>", separated by semicolons.
  • Referer: includes a 'Referer: URL' header in HTTP requests.
  • EnableCookies: enables or disables cookies for the crawling process.

UserAgent

The UserAgent element identifies the crawler to the server as a specific user agent originating the request.

  • Name: agent name, the only required attribute.
  • Version
  • Description
  • Url
  • Email

The generated User-Agent string looks like the following: Name/Version (Description, Url, Email).
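
For example, a UserAgent configured as in this sketch (all values are illustrative) would be advertised as "MyCrawler/2.0 (Test agent, http://crawler.example.com, contact@example.com)":

<UserAgent Name="MyCrawler" Version="2.0" Description="Test agent" Url="http://crawler.example.com" Email="contact@example.com"/>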

Robotstxt

The Robotstxt element is used for supporting robots.txt information.

  • Policy: there are four types of policies offered on how to deal with robots.txt rules:
 1. Classic
    Simply obey the robots.txt rules. Recommended unless you have special permission to collect a site more aggressively.
 2. Ignore
    Completely ignore robots.txt rules.
 3. Custom
    Obey user-defined, custom robots.txt rules instead of those discovered on the relevant site. In this case the Value attribute must contain the path to the custom robots.txt file.
 4. Set
    Limit the robot names whose rules are followed to the given set. In this case the Value attribute must contain the robot names, separated by semicolons.
  • Value: specifies the file name with the robots.txt rules for the Custom policy, and the set of agent names for the Set policy.
  • AgentNames: specifies the list of agents we advertise. This list should start with the same name as the UserAgent Name (i.e. the crawler user-agent name that is used for the crawl job).
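
For example, a Set policy sketch (the robot names are illustrative) that only follows robots.txt rules addressed to the listed robots:

<Robotstxt Policy="Set" Value="mycrawler;googlebot" AgentNames="mycrawler"/>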

CrawlingModel

Two crawling models are available:

 1. Max iterations: crawls a web site following a limited number of links.
 2. Max depth: crawls a web site up to a specified maximum crawling depth.
  • Type: the model type, "MaxIterations" or "MaxDepth".
  • Value: the model parameter (a positive integer).
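
For example, a sketch limiting the crawl to a maximum depth of 3 (the value is illustrative):

<CrawlingModel Type="MaxDepth" Value="3"/>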

CrawlScope

A crawl scope decides for each discovered URI if it is within the scope of the current crawl.

  • Type: the following scopes are provided:
    • Broad: accept all.
      This scope does not impose any limits on the hosts, domains, or URI paths crawled.
    • Domain: accept if on the same 'domain' (for some definition) as the seeds.
      This scope limits discovered URIs to the set of domains defined by the provided seeds. That is, any discovered URI belonging to a domain from which one of the seeds came is within scope. Using the seed 'brox.de', a domain scope will fetch 'bugs.brox.de', 'confluence.brox.de', etc. It will fetch all discovered URIs from 'brox.de' and from any subdomain of 'brox.de'.
    • Host: accept if on the exact host as the seeds.
      This scope limits discovered URIs to the set of hosts defined by the provided seeds.
      If the seed is 'www.brox.de', then only items discovered on this host will be fetched. The crawler will not go to 'bugs.brox.de'.
    • Path: accept if on the same host and with a shared path-prefix as the seeds.
      This scope goes yet further and limits the discovered URIs to a section of paths on hosts defined by the seeds. Any host that has a seed pointing at its root (i.e. www.sample.com/index.html) will be included in full, whereas a host whose only seed is www.sample2.com/path/index.html will be limited to URIs under /path/.

Every scope can have additional filters to select URIs that will be considered to be within or out of scope (see the section Filters for details). For example:

<CrawlScope Type="Broad">
    <Filters>
    <Filter Type="BeginningPath" WorkType="Select" Value="/level3.html"/>
    </Filters>
</CrawlScope>

CrawlLimits

SizeLimits

In addition to limits imposed on the scope of the crawl it is possible to enforce arbitrary limits on the duration and extent of the crawl with the following settings:

  • MaxBytesDownload
    Stop after a fixed number of bytes have been downloaded. 0 means unlimited.
  • MaxDocumentDownload
    Stop after downloading a fixed number of documents. 0 means unlimited.
  • MaxTimeSec
    Stop after a certain number of seconds have elapsed. 0 means unlimited.

These are not supposed to be hard limits. Once one of these limits is hit, it will trigger a graceful termination of the crawl job, which means that URIs already being crawled will be completed. As a result, the set limit will be exceeded by some amount.

  • MaxLengthBytes
    Maximum number of bytes to download per document. The file will be truncated once this limit is reached.
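
For example, a minimal SizeLimits sketch (the values are illustrative) that stops the crawl gracefully after 100 documents and truncates any document larger than 500000 bytes:

<SizeLimits MaxDocumentDownload="100" MaxLengthBytes="500000"/>
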
TimeoutLimits
  • Timeout
    Network timeout in seconds. This is equivalent to specifying DnsTimeout, ConnectTimeout, and ReadTimeout all at the same time. Whenever the crawler connects to or reads from a remote host, it checks for a timeout and aborts the operation if the time expires. This prevents anomalous occurrences such as hanging reads or infinite connects.
  • ConnectTimeout
    Connect timeout in seconds. TCP connections that take longer to establish will be aborted.
  • ReadTimeout
    Read (and write) timeout in seconds. Reads that take longer will fail. The default value for read timeout is 900 seconds.
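
For example, a TimeoutLimits sketch (the values are illustrative) that aborts connections not established within 10 seconds and reads not completed within 300 seconds:

<TimeoutLimits ConnectTimeout="10" ReadTimeout="300"/>
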
WaitLimits

The following settings impose additional wait time between the end of processing one URI and the start of the next.

  • Wait
    Wait the specified number of seconds between retrievals. Use of this option is recommended, as it lightens the server load by making the requests less frequent. Specifying a large value for this option is useful if the network or the destination host is down, so that the crawler can wait long enough to reasonably expect the network error to be fixed before the retry.
  • RandomWait
    Some web sites may perform log analysis to identify retrieval programs by looking for statistically significant similarities in the time between requests. This option causes the time between requests to vary between 0 and 2 * wait seconds, where wait was specified using the Wait setting, in order to mask the crawler's presence from such analysis.

A policy on retrying URIs that encountered errors that usually are transitory (socket timeouts etc.) can be defined with the following settings:

  • MaxRetries
    How often to retry URIs that encounter possible transient errors.
  • WaitRetry
    How long to wait between such retries.
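
For example, a WaitLimits sketch (the values are illustrative) combining a randomized wait of up to 2 * 5 seconds between requests with up to 3 retries, 30 seconds apart, for transient errors:

<WaitLimits Wait="5" RandomWait="true" MaxRetries="3" WaitRetry="30"/>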

Proxy

The Proxy element specifies the HTTP proxy server to be used.

ProxyServer

The list of available attributes:

  • Host: proxy host.
  • Port: proxy port.
  • Login: proxy login.
  • Password: proxy password.
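
For example, a Proxy sketch (host and credentials are illustrative); per the schema above, an AutomaticConfiguration element with an Address attribute may be used instead of ProxyServer:

<Proxy>
  <ProxyServer Host="proxy.example.com" Port="3128" Login="user" Password="pass"/>
</Proxy>
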
Authentication

The Authentication element is used to gain access to areas of websites requiring authentication. Three types of authentication are available: RFC2617 (BASIC and DIGEST types of authentication), HTTP POST or GET of an HTML form, and SSL certificate based client authentication.

1. RFC2617 (BASIC and DIGEST types of authentication). To use this type of authentication you need to supply Host, Port, Realm, Login, and Password.

  • Host and
  • Port: these two attributes equate to the canonical root URI of RFC2617.
  • Realm: the realm as per RFC2617. The realm string must match exactly the realm name presented in the authentication challenge served up by the web server.
  • Login: the username used for login.
  • Password: the password for this restricted area.

2. HTML form POST or GET. To use this type of authentication you need to supply CredentialDomain, HttpMethod, LoginUri, and FormElements.

  • CredentialDomain: same as the RFC2617 canonical root URI.
  • HttpMethod: POST or GET.
  • LoginUri: relative or absolute URI of the page that the HTML form submits to (not the page that contains the HTML form).
  • FormElements: a listing of HTML form key/value pairs.

3. SSL certificate based client authentication. Supply ProtocolName, Port, TruststoreUrl, TruststorePassword, KeystoreUrl, and KeystorePassword.

  • ProtocolName: the name of the protocol to be used, e.g. "https".
  • Port: the port number.
  • TruststoreUrl: the location of the file containing one or several trusted certificates.
  • TruststorePassword
  • KeystoreUrl: the location of the file containing a private key/public certificate pair.
  • KeystorePassword
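
For example, an SslCertificate sketch (protocol, port, file locations and passwords are illustrative):

<Authentication>
  <SslCertificate ProtocolName="https" Port="443" TruststoreUrl="/home/user/truststore.jks" TruststorePassword="secret" KeystoreUrl="/home/user/keystore.jks" KeystorePassword="secret"/>
</Authentication>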

Seeds

The Seeds element contains a list of Seed elements. A Seed element defines the site's start path from which the indexing process will be initiated.

  • FollowLinks: controls how pages whose URLs do not fit the filters are handled. The following options are available (see the sketch after this list):
    • NoFollow: do not analyze anything that matches some "Unselect" filter.
    • Follow: analyze everything that matches some "Unselect" filter, but do not index it.
    • FollowLinksWithCorrespondingSelectFilter: index pages that match both a "Select" and an "Unselect" filter, and analyze everything else that matches some "Unselect" filter.
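
For example, a Seeds sketch (the URL is illustrative) where pages matching both a "Select" and an "Unselect" filter are indexed, and other unselected pages are analyzed but not indexed:

<Seeds FollowLinks="FollowLinksWithCorrespondingSelectFilter">
  <Seed>http://www.example.com/</Seed>
</Seeds>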

Filters

The Filters element contains a list of Filter elements, each with an optional Refinements element.

Filter

The Filter element is used to define filters for pages that should be crawled and indexed.

  • Type
    The following filter types are available:
    • BeginningPath: filters paths which begin with the specified characters.
    • RegExp: filters URLs based on a regular expression.
    • ContentType: filters content types based on a regular expression. Use this filter to abort the download of content types other than those wanted.
  • WorkType: Select or Unselect; the way the filter should work.
  • Value: the filter value that will be used to check whether a given value matches the filter or not.

Refinements

The Refinements element must be nested inside a Filter element. It allows filter settings to be modified under certain circumstances. The following refinements may be applied to filters:

Port element: matches only URIs with the given port number.

  • Number: port number.

TimeOfDay element: if this refinement is applied, the filter will only be in effect between the specified hours each day.

  • From: the time when the filter becomes enabled.
  • To: the time until which the filter remains enabled.

The From and To attributes must be in HH:mm:ss format (e.g. 23:00:00).

MetaTagFilters

The element MetaTagFilters contains a list of MetaTagFilter elements.

MetaTagFilter

The MetaTagFilter element defines a filter for omitting content based on meta tags.

  • Type: the type of meta tag to match, Name or HttpEquiv (see the sketch after this list).
  • Name: the name of the tag, e.g. "author" for the Type "Name".
  • Content: the tag contents.
  • WorkType: Select or Unselect.
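
For example, an HttpEquiv filter sketch (the tag name and content are illustrative) that skips pages declaring <meta http-equiv="refresh" content="0">:

<MetaTagFilters>
  <MetaTagFilter Type="HttpEquiv" Name="refresh" Content="0" WorkType="Unselect"/>
</MetaTagFilters>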

Configuration examples

Here are some examples of the Process element configuration:

1. Minimal configuration example. This example demonstrates the minimal configuration required for the crawler.

<WebSite ProjectName="Minimal Configuration">
  <Seeds>
    <Seed>http://localhost/test/</Seed>
  </Seeds>
</WebSite>

2. HTML form login example. This example demonstrates how to log in to an Invision Power Board powered forum. The number of downloaded pages is limited to 15, robots.txt information is ignored, and the crawler will advertise itself as Mozilla/5.0.

<WebSite ProjectName="Login To Invision Powerboard Forum Example">
  <UserAgent Name="Mozilla" Version="5.0" Description="" Url="" Email=""/>
    <Robotstxt Policy="Ignore" />
      <CrawlLimits>
	<SizeLimits MaxDocumentDownload="15"/>
      </CrawlLimits>
  <Authentication>
    <HtmlForm CredentialDomain="http://forum.example.com/index.php?act=Login&amp;CODE=00" LoginUri="http://forum.example.com/index.php?act=Login&amp;CODE=01" HttpMethod="POST">
      <FormElements>
        <FormElement Key="referer" Value=""/>
      	<FormElement Key="CookieDate" Value="1"/>
      	<FormElement Key="Privacy" Value="1"/>
      	<FormElement Key="UserName" Value="User"/>
      	<FormElement Key="PassWord" Value="Password"/>
      	<FormElement Key="submit" Value="Enter"/>
      </FormElements>
    </HtmlForm>
  </Authentication>
  <Seeds FollowLinks="Follow">
    <Seed><![CDATA[http://forum.example.com/index.php?act=Login&CODE=00]]></Seed>
  </Seeds>
</WebSite>

3. Multiple websites configuration example.

<WebSite ProjectName="First WebSite">
  <UserAgent Name="Brox Crawler" Version="1.0" Description="Brox Crawler" Url="http://www.example.com" Email="crawler@example.com"/>
	<CrawlingModel Type="MaxIterations" Value="20"/>
	<CrawlScope Type="Broad">  
	<CrawlLimits>
	  <SizeLimits MaxBytesDownload="0" MaxDocumentDownload="100" MaxTimeSec="3600" MaxLengthBytes="1000000" />
	  <TimeoutLimits Timeout="10000" />
	  <WaitLimits Wait="0" RandomWait="false" MaxRetries="8" WaitRetry="0"/>
	</CrawlLimits>
  <Seeds FollowLinks="Follow">
	  <Seed>http://localhost/</Seed>
	  <Seed>http://localhost/otherseed</Seed>
  </Seeds>
	<Authentication>
	  <Rfc2617 Host="localhost" Port="80" Realm="Restricted area" Login="user" Password="pass"/>				  				  				  
	  <HtmlForm CredentialDomain="http://localhost:8081/admin/" LoginUri="/j_security_check" HttpMethod="GET">
      <FormElements>
	      <FormElement Key="j_username" Value="admin"/>
	      <FormElement Key="j_password" Value=""/>
	      <FormElement Key="submit" Value="Login"/>
	    </FormElements>
	  </HtmlForm>
	</Authentication>
</WebSite>
<WebSite ProjectName="Second WebSite">
  <UserAgent Name="Mozilla" Version="5.0" Description="X11; U; Linux x86_64; en-US; rv:1.8.1.4" />
	<Robotstxt Policy="Classic" AgentNames="mozilla, googlebot"/>
	<CrawlingModel Type="MaxDepth" Value="100"/>
	<CrawlScope Type="Host"/>
	<CrawlLimits>
	  <WaitLimits Wait="5" RandomWait="true"/>
	</CrawlLimits>
	<Seeds FollowLinks="NoFollow">
		<Seed>http://example.com</Seed>
	</Seeds>
	<Filters>
		<Filter Type="BeginningPath" WorkType="Unselect" Value="/something/">
			<Refinements>
				<TimeOfDay From="09:00:00" To="23:00:00"/>
				<Port Number="80"/>
			</Refinements>
		</Filter>
		<Filter Type="RegExp" WorkType="Unselect" Value="news"/>
		<Filter Type="ContentType" WorkType="Unselect" Value="image/jpeg"/>
	</Filters>
</WebSite>

4. Complex website configuration example

<WebSite ProjectName="Example Crawler Configuration" Header="Accept-Encoding: gzip,deflate; Via: myProxy" Referer="http://myReferer">
  <UserAgent Name="Crawler" Version="1.0" Description="Test crawler" Url="http://www.example.com" Email="crawler@example.com"/>
    <Robotstxt Policy="Custom" Value="/home/user/customRobotRules.txt" AgentNames="agent1;agent2"/>
    <CrawlingModel Type="MaxIterations" Value="20"/>
    <CrawlScope Type="Broad">
      <Filters>
        <Filter Type="BeginningPath" WorkType="Select" Value="/test.html"/>
      </Filters>
    </CrawlScope>
    <CrawlLimits>
      <SizeLimits MaxBytesDownload="0" MaxDocumentDownload="1" MaxTimeSec="3600" MaxLengthBytes="1000000" />
      <TimeoutLimits Timeout="10000" />
      <WaitLimits Wait="0" RandomWait="false" MaxRetries="8" WaitRetry="0"/>
    </CrawlLimits>
    <Proxy>
      <ProxyServer Host="example.com" Port="3128" Login="user" Password="pass"/>
    </Proxy>
    <Authentication>
      <Rfc2617 Host="somehost.com" Port="80" Realm="realm string" Login="user" Password="pass"/>
    </Authentication>
    <Seeds FollowLinks="NoFollow">
      <Seed>http://example.com</Seed>
    </Seeds>
    <Filters>
      <Filter Type="BeginningPath" WorkType="Unselect" Value="/something/">
        <Refinements>
          <TimeOfDay From="09:00:00" To="23:00:00"/>
          <Port Number="80"/>
        </Refinements>
      </Filter>
      <Filter Type="RegExp" WorkType="Unselect" Value="news"/>
      <Filter Type="ContentType" WorkType="Unselect" Value="image/jpeg"/>
    </Filters>
    <MetaTagFilters>
      <MetaTagFilter Type="Name" Name="author" Content="Blocked Author" WorkType="Unselect"/>
    </MetaTagFilters>
</WebSite>
