Notice: This Wiki is now read only and edits are no longer possible. Please see: https://gitlab.eclipse.org/eclipsefdn/helpdesk/-/wikis/Wiki-shutdown-plan for the plan.
SMILA/Documentation/Web Crawler
XML Index Order
The following is an example of a Web Crawler index order:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example index order for the web crawler. -->
<IndexOrderConfiguration xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

  <!-- Data source identification and crawler binding. -->
  <DataSourceID>Web_TEST</DataSourceID>
  <SchemaID>org.eclipse.eilf.connectivity.framework.crawler.web</SchemaID>
  <DataConnectionID>
    <Crawler>MyWebCrawler</Crawler>
  </DataConnectionID>
  <CompoundHandling>No</CompoundHandling>

  <!-- Attributes: each one is backed by either a FieldAttribute or a MetaAttribute. -->
  <Attributes>
    <!-- Url is marked as the key attribute. -->
    <Attribute Name="Url" Type="String" KeyAttribute="true">
      <FieldAttribute>Url</FieldAttribute>
    </Attribute>
    <Attribute Name="Title" Type="String">
      <FieldAttribute>Title</FieldAttribute>
    </Attribute>
    <!-- Content is marked as hash attribute and as attachment. -->
    <Attribute Name="Content"
               Type="String"
               HashAttribute="true"
               Attachment="true"
               MimeTypeAttribute="Content">
      <FieldAttribute>Content</FieldAttribute>
    </Attribute>
    <Attribute Name="MetaData" Type="String" Attachment="false">
      <MetaAttribute Type="MetaData"/>
    </Attribute>
    <!-- A MetaAttribute may list individual MetaName entries. -->
    <Attribute Name="ResponseHeader" Type="String" Attachment="false">
      <MetaAttribute Type="ResponseHeader">
        <MetaName>Date</MetaName>
        <MetaName>Server</MetaName>
      </MetaAttribute>
    </Attribute>
    <Attribute Name="MetaDataWithResponseHeaderFallBack" Type="String" Attachment="false">
      <MetaAttribute Type="MetaDataWithResponseHeaderFallBack"/>
    </Attribute>
  </Attributes>

  <!-- Crawl process: one WebSite element per crawled site. -->
  <Process>
    <WebSite ProjectName="Example Crawler Configuration"
             Header="Accept-Encoding: gzip,deflate; Via: myProxy"
             Referer="http://myReferer">
      <UserAgent Name="Crawler"
                 Version="1.0"
                 Description="Test crawler"
                 Url="http://www.softaria.com"
                 Email="crawler@example.com"/>
      <CrawlingModel Type="MaxIterations" Value="20"/>

      <!-- Crawl scope with its own filter list. -->
      <CrawlScope Type="Broad">
        <Filters>
          <Filter Type="BeginningPath" WorkType="Select" Value="/test.html"/>
        </Filters>
      </CrawlScope>

      <!-- Size, timeout and wait limits. -->
      <CrawlLimits>
        <SizeLimits MaxBytesDownload="0"
                    MaxDocumentDownload="10"
                    MaxTimeSec="3600"
                    MaxLengthBytes="1000000"/>
        <TimeoutLimits Timeout="10000"/>
        <WaitLimits Wait="0" RandomWait="false" MaxRetries="8" WaitRetry="0"/>
      </CrawlLimits>

      <!-- Start URLs. -->
      <Seeds FollowLinks="NoFollow">
        <Seed>http://www.brox.de</Seed>
      </Seeds>

      <!-- Site-level filters; the first one carries Refinements (TimeOfDay and Port). -->
      <Filters>
        <Filter Type="BeginningPath" WorkType="Unselect" Value="/something/">
          <Refinements>
            <TimeOfDay From="09:00:00" To="23:00:00"/>
            <Port Number="80"/>
          </Refinements>
        </Filter>
        <Filter Type="RegExp" WorkType="Unselect" Value="news"/>
        <Filter Type="ContentType" WorkType="Unselect" Value="image/jpeg"/>
      </Filters>

      <!-- Filters on HTML meta tags. -->
      <MetaTagFilters>
        <MetaTagFilter Type="Name" Name="author" Content="Blocked Author" WorkType="Unselect"/>
      </MetaTagFilters>
    </WebSite>
  </Process>
</IndexOrderConfiguration>
XSD Schema used for Web Crawler
<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Liquid XML Studio 1.0.8.0 (http://www.liquid-technologies.com) -->
<!-- Web crawler extension of the root index order schema (no target namespace). -->
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
           elementFormDefault="qualified"
           attributeFormDefault="unqualified">

  <!-- Redefinitions: extend the Attribute and Process types taken from the root schema. -->
  <xs:redefine schemaLocation="RootIndexOrderConfiguration.xsd">

    <xs:complexType name="Attribute">
      <xs:annotation>
        <xs:documentation>Attribute Specification</xs:documentation>
      </xs:annotation>
      <xs:complexContent mixed="false">
        <xs:extension base="Attribute">
          <!-- Exactly one of FieldAttribute or MetaAttribute. -->
          <xs:choice>
            <xs:element name="FieldAttribute" type="FieldAttributeType" />
            <xs:element name="MetaAttribute" type="MetaAttributeType" />
          </xs:choice>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>

    <xs:complexType name="Process">
      <xs:annotation>
        <xs:documentation>Process Specification</xs:documentation>
      </xs:annotation>
      <xs:complexContent mixed="false">
        <xs:extension base="Process">
          <xs:sequence>
            <xs:element name="WebSite" type="WebSite" minOccurs="0" maxOccurs="unbounded" />
          </xs:sequence>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>

  </xs:redefine>

  <!-- Enumerations. -->

  <xs:simpleType name="CrawlScope">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Broad" />
      <xs:enumeration value="Domain" />
      <xs:enumeration value="Host" />
      <xs:enumeration value="Path" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="FollowLinksType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Follow" />
      <xs:enumeration value="NoFollow" />
      <xs:enumeration value="FollowLinksWithCorrespondingSelectFilter" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="FilterType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="BeginningPath" />
      <xs:enumeration value="RegExp" />
      <xs:enumeration value="ContentType" />
      <xs:enumeration value="CrawlScope" />
      <xs:enumeration value="HtmlMetaTag" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="FilterWorkType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Select" />
      <xs:enumeration value="Unselect" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="ModelType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="MaxIterations" />
      <xs:enumeration value="MaxDepth" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="FieldAttributeType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Url" />
      <xs:enumeration value="Title" />
      <xs:enumeration value="Content" />
    </xs:restriction>
  </xs:simpleType>

  <!-- Disabled declaration kept from the original schema: -->
  <!-- xs:simpleType name="MetaAttributeType" -->
  <xs:simpleType name="MetaType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="MetaData" />
      <xs:enumeration value="ResponseHeader" />
      <xs:enumeration value="MetaDataWithResponseHeaderFallBack" />
    </xs:restriction>
  </xs:simpleType>

  <!-- Content of a MetaAttribute element: a required Type plus any number of MetaName children. -->
  <xs:complexType name="MetaAttributeType">
    <xs:sequence>
      <xs:element name="MetaName" type="xs:string" minOccurs="0" maxOccurs="unbounded" />
    </xs:sequence>
    <xs:attribute name="Type" type="MetaType" use="required" />
    <!-- Disabled attribute form of MetaName, kept from the original schema
         (presumably superseded by the MetaName child element; confirm before removing): -->
    <!-- xs:attribute name="MetaName" type="xs:string" use="optional" / -->
  </xs:complexType>

  <xs:simpleType name="Robotstxt">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Classic" />
      <xs:enumeration value="Ignore" />
      <xs:enumeration value="Custom" />
      <xs:enumeration value="Set" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="HttpMethod">
    <xs:restriction base="xs:string">
      <xs:enumeration value="GET" />
      <xs:enumeration value="POST" />
    </xs:restriction>
  </xs:simpleType>

  <xs:simpleType name="HtmlMetaTagType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Name" />
      <xs:enumeration value="HttpEquiv" />
    </xs:restriction>
  </xs:simpleType>

  <!-- WebSite: configuration of one crawled site. Child elements must appear in the
       order declared below; only Seeds is mandatory. -->
  <xs:complexType name="WebSite">
    <xs:sequence>

      <xs:element name="UserAgent" minOccurs="0">
        <xs:complexType>
          <xs:attribute name="Name" type="xs:string" use="required" />
          <xs:attribute name="Version" type="xs:string" use="optional" />
          <xs:attribute name="Description" type="xs:string" use="optional" />
          <xs:attribute name="Url" type="xs:string" use="optional" />
          <xs:attribute name="Email" type="xs:string" use="optional" />
        </xs:complexType>
      </xs:element>

      <xs:element name="Robotstxt" minOccurs="0">
        <xs:complexType>
          <xs:attribute name="Policy" type="Robotstxt" use="optional" default="Classic" />
          <xs:attribute name="Value" type="xs:string" use="optional" default="" />
          <xs:attribute name="AgentNames" type="xs:string" use="optional" default="" />
        </xs:complexType>
      </xs:element>

      <xs:element name="CrawlingModel" minOccurs="0">
        <xs:complexType>
          <xs:attribute name="Type" type="ModelType" use="required" />
          <xs:attribute name="Value" type="xs:positiveInteger" use="required" />
        </xs:complexType>
      </xs:element>

      <!-- Crawl scope; its filters use the plain Filter type (no Refinements). -->
      <xs:element name="CrawlScope" minOccurs="0">
        <xs:complexType>
          <xs:sequence>
            <xs:element name="Filters" minOccurs="0">
              <xs:complexType>
                <xs:sequence>
                  <xs:element name="Filter" maxOccurs="unbounded">
                    <xs:complexType>
                      <xs:complexContent mixed="false">
                        <xs:extension base="Filter" />
                      </xs:complexContent>
                    </xs:complexType>
                  </xs:element>
                </xs:sequence>
              </xs:complexType>
            </xs:element>
          </xs:sequence>
          <xs:attribute name="Type" type="CrawlScope" use="optional" default="Host" />
        </xs:complexType>
      </xs:element>

      <xs:element name="CrawlLimits" minOccurs="0">
        <xs:complexType>
          <xs:sequence>

            <xs:element name="SizeLimits" minOccurs="0">
              <xs:complexType>
                <xs:attribute name="MaxBytesDownload" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="MaxDocumentDownload" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="MaxTimeSec" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="MaxLengthBytes" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="LimitRate" type="xs:integer" use="optional" default="0" />
              </xs:complexType>
            </xs:element>

            <xs:element name="TimeoutLimits" minOccurs="0">
              <xs:complexType>
                <xs:attribute name="Timeout" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="DnsTimeout" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="ConnectTimeout" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="ReadTimeout" type="xs:integer" use="optional" default="900" />
              </xs:complexType>
            </xs:element>

            <xs:element name="WaitLimits" minOccurs="0">
              <xs:complexType>
                <xs:attribute name="Wait" type="xs:integer" use="optional" default="0" />
                <!-- The default uses the numeric xs:boolean lexical form 0, i.e. false. -->
                <xs:attribute name="RandomWait" type="xs:boolean" use="optional" default="0" />
                <xs:attribute name="WaitRetry" type="xs:integer" use="optional" default="0" />
                <xs:attribute name="MaxRetries" type="xs:integer" use="optional" default="0" />
              </xs:complexType>
            </xs:element>

          </xs:sequence>
        </xs:complexType>
      </xs:element>

      <!-- Proxy: either an explicit proxy server or an automatic configuration address. -->
      <xs:element name="Proxy" minOccurs="0">
        <xs:complexType>
          <xs:choice>
            <xs:element name="ProxyServer">
              <xs:complexType>
                <xs:attribute name="Host" type="xs:string" use="required" />
                <xs:attribute name="Port" type="xs:string" use="required" />
                <xs:attribute name="Login" type="xs:string" use="optional" default="" />
                <xs:attribute name="Password" type="xs:string" use="optional" default="" />
              </xs:complexType>
            </xs:element>
            <xs:element name="AutomaticConfiguration">
              <xs:complexType>
                <xs:attribute name="Address" type="xs:string" use="required" />
              </xs:complexType>
            </xs:element>
          </xs:choice>
        </xs:complexType>
      </xs:element>

      <!-- Authentication: Rfc2617 entries, then HtmlForm entries, then SslCertificate entries. -->
      <xs:element name="Authentication" minOccurs="0">
        <xs:complexType>
          <xs:sequence>

            <xs:element name="Rfc2617" minOccurs="0" maxOccurs="unbounded">
              <xs:complexType>
                <xs:attribute name="Host" type="xs:string" use="required" />
                <xs:attribute name="Port" type="xs:string" use="required" />
                <xs:attribute name="Realm" type="xs:string" use="required" />
                <xs:attribute name="Login" type="xs:string" use="required" />
                <xs:attribute name="Password" type="xs:string" use="required" />
              </xs:complexType>
            </xs:element>

            <xs:element name="HtmlForm" minOccurs="0" maxOccurs="unbounded">
              <xs:complexType>
                <xs:sequence>
                  <xs:element name="FormElements">
                    <xs:complexType>
                      <xs:sequence>
                        <xs:element name="FormElement" maxOccurs="unbounded">
                          <xs:complexType>
                            <xs:attribute name="Key" type="xs:string" use="required" />
                            <xs:attribute name="Value" type="xs:string" use="required" />
                          </xs:complexType>
                        </xs:element>
                      </xs:sequence>
                    </xs:complexType>
                  </xs:element>
                </xs:sequence>
                <xs:attribute name="CredentialDomain" type="xs:string" use="required" />
                <xs:attribute name="LoginUri" type="xs:string" use="required" />
                <xs:attribute name="HttpMethod" type="HttpMethod" use="required" />
              </xs:complexType>
            </xs:element>

            <xs:element name="SslCertificate" minOccurs="0" maxOccurs="unbounded">
              <xs:complexType>
                <xs:attribute name="ProtocolName" type="xs:string" use="required" />
                <xs:attribute name="Port" type="xs:string" use="required" />
                <xs:attribute name="TruststoreUrl" type="xs:string" use="required" />
                <xs:attribute name="TruststorePassword" type="xs:string" use="optional" default="" />
                <xs:attribute name="KeystoreUrl" type="xs:string" use="required" />
                <xs:attribute name="KeystorePassword" type="xs:string" use="optional" default="" />
              </xs:complexType>
            </xs:element>

          </xs:sequence>
        </xs:complexType>
      </xs:element>

      <xs:element name="Ssl" minOccurs="0">
        <xs:complexType>
          <xs:attribute name="TruststoreUrl" type="xs:string" use="required" />
          <xs:attribute name="TruststorePassword" type="xs:string" use="optional" default="" />
        </xs:complexType>
      </xs:element>

      <!-- Seeds: the only mandatory child of WebSite; at least one Seed is required. -->
      <xs:element name="Seeds">
        <xs:complexType>
          <xs:sequence>
            <xs:element name="Seed" type="xs:string" maxOccurs="unbounded" />
          </xs:sequence>
          <xs:attribute name="FollowLinks" type="FollowLinksType" use="optional" default="Follow" />
        </xs:complexType>
      </xs:element>

      <!-- Site-level filters: the Filter type extended by an optional Refinements child. -->
      <xs:element name="Filters" minOccurs="0">
        <xs:complexType>
          <xs:sequence>
            <xs:element name="Filter" maxOccurs="unbounded">
              <xs:complexType>
                <xs:complexContent mixed="false">
                  <xs:extension base="Filter">
                    <xs:sequence>
                      <xs:element name="Refinements" minOccurs="0">
                        <xs:complexType>
                          <xs:sequence>
                            <xs:element name="TimeOfDay" minOccurs="0">
                              <xs:complexType>
                                <xs:attribute name="From" type="xs:time" use="required" />
                                <xs:attribute name="To" type="xs:time" use="required" />
                              </xs:complexType>
                            </xs:element>
                            <xs:element name="Port" minOccurs="0">
                              <xs:complexType>
                                <xs:attribute name="Number" type="xs:integer" use="required" />
                              </xs:complexType>
                            </xs:element>
                          </xs:sequence>
                        </xs:complexType>
                      </xs:element>
                    </xs:sequence>
                  </xs:extension>
                </xs:complexContent>
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:complexType>
      </xs:element>

      <xs:element name="MetaTagFilters" minOccurs="0">
        <xs:complexType>
          <xs:sequence>
            <xs:element name="MetaTagFilter" maxOccurs="unbounded">
              <xs:complexType>
                <xs:attribute name="Type" type="HtmlMetaTagType" use="required" />
                <xs:attribute name="Name" type="xs:string" use="required" />
                <xs:attribute name="Content" type="xs:string" use="required" />
                <xs:attribute name="WorkType" type="FilterWorkType" use="required" />
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:complexType>
      </xs:element>

    </xs:sequence>

    <!-- Attributes of the WebSite element itself. -->
    <xs:attribute name="ProjectName" type="xs:string" use="required" />
    <xs:attribute name="Sitemaps" type="xs:boolean" use="optional" default="false" />
    <xs:attribute name="Header" type="xs:string" use="optional" default="" />
    <xs:attribute name="Referer" type="xs:string" use="optional" default="" />
    <xs:attribute name="EnableCookies" type="xs:boolean" use="optional" default="true" />
  </xs:complexType>

  <!-- Base filter type shared by the CrawlScope filters and the site-level filters. -->
  <xs:complexType name="Filter">
    <xs:attribute name="WorkType" type="FilterWorkType" use="required" />
    <xs:attribute name="Value" type="xs:string" use="required" />
    <xs:attribute name="Type" type="FilterType" use="required" />
  </xs:complexType>

</xs:schema>