Skip to main content

Notice: this Wiki will be going read only early in 2024 and edits will no longer be possible. Please see: https://gitlab.eclipse.org/eclipsefdn/helpdesk/-/wikis/Wiki-shutdown-plan for the plan.

Jump to: navigation, search

Difference between revisions of "SMILA/Documentation/Filesystem Crawler"

Line 145: Line 145:
  
 
</xs:schema>
 
</xs:schema>
 +
</source>
 +
 +
== Attribute element ==
 +
'''FileAttributes.'''
 +
The FileAttributes element specifies which basic file information should be crawled. The following options exist:
 +
# Name: the file name.
 +
# Path: the file complete path.
 +
# FileExtension: the file extension.
 +
# Size: the file size.
 +
# LastModifiedDate: the file last modification date.
 +
# Content: the content of the file is emitted as byte[].
 +
<source lang="xml">
 +
<Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
 +
  <FileAttributes>LastModifiedDate</FileAttributes>
 +
</Attribute>
 +
<Attribute Type="String" Name="Filename">
 +
  <FileAttributes>Name</FileAttributes>
 +
</Attribute>
 +
<Attribute Type="String" Name="Path" KeyAttribute="true">
 +
  <FileAttributes>Path</FileAttributes>
 +
</Attribute>
 +
<Attribute Type="String" Name="Content" Attachment="true">
 +
  <FileAttributes>Content</FileAttributes>
 +
</Attribute>
 +
<Attribute Type="String" Name="Extension">
 +
  <FileAttributes>FileExtension</FileAttributes>
 +
</Attribute>
 +
<Attribute Type="String" Name="Size">
 +
  <FileAttributes>Size</FileAttributes>
 +
</Attribute>   
 
</source>
 
</source>

Revision as of 10:00, 13 January 2009

Filesystem Index Order

The following is an example of a Filesystem Index Order configuration:

<!--
  Example index order for the Filesystem Crawler.
  The xsi:noNamespaceSchemaLocation hint points at filesystemIndexOrder.xsd,
  the crawler schema reproduced further down this page.
-->
<IndexOrderConfiguration
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:noNamespaceSchemaLocation="../org.eclipse.smila.connectivity.framework.crawler.filesystem/schemas/filesystemIndexOrder.xsd"
>
  <!--
    NOTE(review): DataSourceID, SchemaID, DataConnectionID and CompoundHandling
    are not declared in the crawler schema shown on this page; presumably they
    come from the redefined RootIndexOrderConfiguration.xsd. Confirm there.
  -->
  <DataSourceID>file</DataSourceID>
  <SchemaID>org.eclipse.smila.connectivity.framework.crawler.filesystem</SchemaID>
  <DataConnectionID>
    <Crawler>FileSystemCrawlerDS</Crawler>
  </DataConnectionID>
  <CompoundHandling>Yes</CompoundHandling>
  <!--
    Per the crawler schema, each Attribute carries exactly one child chosen
    from FileAttributes, AccessTree or AccessList.
    NOTE(review): the Type, Name, HashAttribute, KeyAttribute and Attachment
    attributes are not declared in the crawler schema shown on this page;
    presumably they belong to the base Attribute type. Confirm their meaning
    against RootIndexOrderConfiguration.xsd.
  -->
  <Attributes>
    <!-- FileAttributes examples; allowed values are Name, Path, Size, LastModifiedDate, Content and FileExtension. -->
    <Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
      <FileAttributes>LastModifiedDate</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Filename">
      <FileAttributes>Name</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Path" KeyAttribute="true">
      <FileAttributes>Path</FileAttributes>
    </Attribute>
    <!-- The page text states that Content is emitted as byte[]. -->
    <Attribute Type="String" Name="Content" Attachment="true">
      <FileAttributes>Content</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Extension">
      <FileAttributes>FileExtension</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Size">
      <FileAttributes>Size</FileAttributes>
    </Attribute>    
    <!-- AccessTree examples; ExpandAccounts is a required boolean in the crawler schema. -->
    <Attribute Type="String" Name="AccessTreeNotExpanded">
      <AccessTree ExpandAccounts="false"/>
    </Attribute>
    <Attribute Type="String" Name="AccessTreeExpanded">
      <AccessTree ExpandAccounts="true"/>
    </Attribute>
    <!--
      AccessList examples; Mask is required and must be exactly three characters
      matching (R|\s)(W|\s)(X|\s), so " W " sets only the middle position.
      NOTE(review): R/W/X presumably stand for read/write/execute. Confirm.
    -->
    <Attribute Type="String" Name="AccessListNotExpanded">
      <AccessList ExpandAccounts="false" Mask=" W "/>
    </Attribute>
    <Attribute Type="String" Name="AccessListExpanded">
      <AccessList ExpandAccounts="true" Mask=" W "/>
    </Attribute>
  </Attributes>
  <!-- Process holds a BaseDir followed by a Filter, as required by the crawler schema. -->
  <Process>
    <BaseDir>c:\data</BaseDir>
    <!-- Recursive="true" and CaseSensitive="false" repeat the schema defaults explicitly. -->
    <Filter Recursive="true" CaseSensitive="false">
      <Include Name="*.txt"/>
      <Include Name="*.htm"/>
      <Include Name="*.html"/>
      <Include Name="*.xml"/>      
    </Filter>
  </Process>
</IndexOrderConfiguration>

XSD Schema used for Filesystem Crawler

<!--
  XSD for Filesystem Crawler index orders.
  Redefines the Process and Attribute types of RootIndexOrderConfiguration.xsd
  and declares the simple and complex types those redefinitions refer to.
-->
<xs:schema elementFormDefault="qualified" attributeFormDefault="unqualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <!-- Inside xs:redefine a type extends itself: base="Process" refers to the original definition in the redefined schema. -->
  <xs:redefine schemaLocation="../../org.eclipse.smila.connectivity.framework.indexorder/schemas/RootIndexOrderConfiguration.xsd">
    <xs:complexType name="Process">
      <xs:annotation>
        <xs:documentation>Process Specification</xs:documentation>
      </xs:annotation>
      <xs:complexContent>
        <xs:extension base="Process">
          <!-- The BaseDir + Filter pair may repeat any number of times; both elements are required in each repetition. -->
          <xs:sequence maxOccurs="unbounded">
            <xs:element name="BaseDir" type="xs:string"/>
            <xs:element name="Filter">
              <xs:complexType>
                <!-- Zero or more Include elements, followed by zero or more Exclude elements. -->
                <xs:sequence>
                  <xs:element name="Include" minOccurs="0" maxOccurs="unbounded">
                    <xs:complexType>
                      <!-- Name is required; DateFrom and DateTo are optional xs:dateTime values. -->
                      <xs:attribute name="Name" type="xs:string" use="required"/>
                      <xs:attribute name="DateFrom" type="xs:dateTime" use="optional"/>
                      <xs:attribute name="DateTo" type="xs:dateTime" use="optional"/>
                    </xs:complexType>
                  </xs:element>
                  <xs:element name="Exclude" minOccurs="0" maxOccurs="unbounded">
                    <xs:complexType>
                      <xs:attribute name="Name" type="xs:string" use="required"/>
                    </xs:complexType>
                  </xs:element>
                </xs:sequence>
                <!-- Both filter switches are optional: CaseSensitive defaults to false, Recursive to true. -->
                <xs:attribute name="CaseSensitive" type="xs:boolean" use="optional" default="false"/>
                <xs:attribute name="Recursive" type="xs:boolean" use="optional" default="true"/>
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>
    <xs:complexType name="Attribute">
      <xs:complexContent>
        <xs:extension base="Attribute">
          <!-- Each Attribute carries exactly one of the three children below. -->
          <xs:choice>
            <xs:element name="FileAttributes" type="FileAttributesType" />
            <xs:element name="AccessTree" type="AccessTreeType" />
            <xs:element name="AccessList" type="AccessListType" />
          </xs:choice>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>
  </xs:redefine>
 
 
  <!-- simple types -->
  <!-- Values allowed as the text content of a FileAttributes element. -->
  <xs:simpleType name="FileAttributesType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Name"/>
      <xs:enumeration value="Path"/>
      <xs:enumeration value="Size"/>
      <xs:enumeration value="LastModifiedDate"/>
      <xs:enumeration value="Content"/>
      <xs:enumeration value="FileExtension"/>
    </xs:restriction>
  </xs:simpleType>
  <!-- Values allowed for the AuthorityFilter attribute of AccessListType. -->
  <xs:simpleType name="AuthorityType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="USERS"/>
      <xs:enumeration value="GROUPS"/>
    </xs:restriction>
  </xs:simpleType>
  <!--
    Exactly three characters: position 1 is R or whitespace, position 2 is W or
    whitespace, position 3 is X or whitespace (for example " W ").
    NOTE(review): R/W/X presumably stand for read/write/execute. Confirm.
  -->
  <xs:simpleType name="MaskType">
    <xs:restriction base="xs:string">
      <xs:pattern value="(R|\s)(W|\s)(X|\s)" />
    </xs:restriction>
  </xs:simpleType>
 
 
  <!-- complex types -->
  <!-- Empty element with a single required boolean attribute, ExpandAccounts. -->
  <xs:complexType name="AccessTreeType">
    <xs:attribute name="ExpandAccounts" type="xs:boolean" use="required"/>
  </xs:complexType>
 
  <!-- Inherits ExpandAccounts from AccessTreeType and adds a required Mask and an optional AuthorityFilter. -->
  <xs:complexType name="AccessListType">
    <xs:complexContent>
      <xs:extension base="AccessTreeType">
        <xs:attribute name="Mask" type="MaskType" use="required"/>
        <xs:attribute name="AuthorityFilter" type="AuthorityType" use="optional"/>
      </xs:extension>
    </xs:complexContent>
  </xs:complexType>
 
</xs:schema>

Attribute element

FileAttributes. The FileAttributes element specifies which basic file information should be crawled. The following options exist:

  1. Name: the file name.
  2. Path: the complete path of the file.
  3. FileExtension: the file extension.
  4. Size: the file size.
  5. LastModifiedDate: the date the file was last modified.
  6. Content: the content of the file, emitted as byte[].
<!--
  Fragment: one Attribute example per FileAttributes option. These elements are
  siblings meant to be placed inside an Attributes element, not a standalone document.
  NOTE(review): Type, Name, HashAttribute, KeyAttribute and Attachment are not
  declared in the crawler schema shown on this page; confirm their meaning
  against RootIndexOrderConfiguration.xsd.
-->
<!-- LastModifiedDate: the file's last modification date. -->
<Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
  <FileAttributes>LastModifiedDate</FileAttributes>
</Attribute>
<!-- Name: the file name. -->
<Attribute Type="String" Name="Filename">
  <FileAttributes>Name</FileAttributes>
</Attribute>
<!-- Path: the complete path of the file. -->
<Attribute Type="String" Name="Path" KeyAttribute="true">
  <FileAttributes>Path</FileAttributes>
</Attribute>
<!-- Content: the content of the file, emitted as byte[]. -->
<Attribute Type="String" Name="Content" Attachment="true">
  <FileAttributes>Content</FileAttributes>
</Attribute>
<!-- FileExtension: the file extension. -->
<Attribute Type="String" Name="Extension">
  <FileAttributes>FileExtension</FileAttributes>
</Attribute>
<!-- Size: the file size. -->
<Attribute Type="String" Name="Size">
  <FileAttributes>Size</FileAttributes>
</Attribute>

Back to the top