Jump to: navigation, search

SMILA/Documentation/Filesystem Crawler

Filesystem Index Order

Following is an example of a Filesystem Index Order:

<!-- Example index order for the filesystem crawler: names the data source and
     crawler, lists the record attributes to produce, and selects the files to crawl. -->
<IndexOrderConfiguration
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:noNamespaceSchemaLocation="../org.eclipse.smila.connectivity.framework.crawler.filesystem/schemas/filesystemIndexOrder.xsd"
>
  <!-- NOTE(review): DataSourceID, SchemaID, DataConnectionID and CompoundHandling are declared
       in the base RootIndexOrderConfiguration.xsd, which is not shown on this page. -->
  <DataSourceID>file</DataSourceID>
  <SchemaID>org.eclipse.smila.connectivity.framework.crawler.filesystem</SchemaID>
  <DataConnectionID>
    <Crawler>FileSystemCrawlerDS</Crawler>
  </DataConnectionID>
  <CompoundHandling>Yes</CompoundHandling>
  <!-- Each Attribute holds exactly one of FileAttributes, AccessTree or AccessList.
       NOTE(review): Type, Name, HashAttribute, KeyAttribute and Attachment come from the
       base Attribute type (not shown here); confirm their meaning against the base schema. -->
  <Attributes>
    <!-- Basic file properties; the element text must be one of the FileAttributesType values. -->
    <Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
      <FileAttributes>LastModifiedDate</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Filename">
      <FileAttributes>Name</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Path" KeyAttribute="true">
      <FileAttributes>Path</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Content" Attachment="true">
      <FileAttributes>Content</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Extension">
      <FileAttributes>FileExtension</FileAttributes>
    </Attribute>
    <Attribute Type="String" Name="Size">
      <FileAttributes>Size</FileAttributes>
    </Attribute>    
    <!-- Raw ACL information, without and with expansion of account groups. -->
    <Attribute Type="String" Name="AccessTreeNotExpanded">
      <AccessTree ExpandAccounts="false"/>
    </Attribute>
    <Attribute Type="String" Name="AccessTreeExpanded">
      <AccessTree ExpandAccounts="true"/>
    </Attribute>
    <!-- Flat account lists filtered by the W position of the three-character RWX mask. -->
    <Attribute Type="String" Name="AccessListNotExpanded">
      <AccessList ExpandAccounts="false" Mask=" W "/>
    </Attribute>
    <Attribute Type="String" Name="AccessListExpanded">
      <AccessList ExpandAccounts="true" Mask=" W "/>
    </Attribute>
  </Attributes>
  <!-- Process: a base directory followed by the filter that selects files below it. -->
  <Process>
    <BaseDir>c:\data</BaseDir>
    <!-- Both flags are set explicitly to the values the schema also uses as defaults. -->
    <Filter Recursive="true" CaseSensitive="false">
      <Include Name="*.txt"/>
      <Include Name="*.htm"/>
      <Include Name="*.html"/>
      <Include Name="*.xml"/>      
    </Filter>
  </Process>
</IndexOrderConfiguration>

XSD Schema used for Filesystem Crawler

<!-- Schema for filesystem crawler index orders: redefines the Process and Attribute
     types of the root index order schema with filesystem-specific content. -->
<xs:schema elementFormDefault="qualified" attributeFormDefault="unqualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:redefine schemaLocation="../../org.eclipse.smila.connectivity.framework.indexorder/schemas/RootIndexOrderConfiguration.xsd">
    <xs:complexType name="Process">
      <xs:annotation>
        <xs:documentation>Process Specification</xs:documentation>
      </xs:annotation>
      <xs:complexContent>
        <xs:extension base="Process">
          <!-- A BaseDir followed by its Filter; the pair may be repeated any number of times. -->
          <xs:sequence maxOccurs="unbounded">
            <xs:element name="BaseDir" type="xs:string"/>
            <xs:element name="Filter">
              <xs:complexType>
                <!-- All Include elements come first, then all Exclude elements; both are optional. -->
                <xs:sequence>
                  <xs:element name="Include" minOccurs="0" maxOccurs="unbounded">
                    <xs:complexType>
                      <xs:attribute name="Name" type="xs:string" use="required"/>
                      <!-- NOTE(review): DateFrom/DateTo presumably restrict matches by file date; confirm against the crawler implementation. -->
                      <xs:attribute name="DateFrom" type="xs:dateTime" use="optional"/>
                      <xs:attribute name="DateTo" type="xs:dateTime" use="optional"/>
                    </xs:complexType>
                  </xs:element>
                  <xs:element name="Exclude" minOccurs="0" maxOccurs="unbounded">
                    <xs:complexType>
                      <xs:attribute name="Name" type="xs:string" use="required"/>
                    </xs:complexType>
                  </xs:element>
                </xs:sequence>
                <!-- When omitted, CaseSensitive defaults to false and Recursive to true. -->
                <xs:attribute name="CaseSensitive" type="xs:boolean" use="optional" default="false"/>
                <xs:attribute name="Recursive" type="xs:boolean" use="optional" default="true"/>
              </xs:complexType>
            </xs:element>
          </xs:sequence>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>
    <xs:complexType name="Attribute">
      <xs:complexContent>
        <xs:extension base="Attribute">
          <!-- Every Attribute carries exactly one of these three child elements. -->
          <xs:choice>
            <xs:element name="FileAttributes" type="FileAttributesType" />
            <xs:element name="AccessTree" type="AccessTreeType" />
            <xs:element name="AccessList" type="AccessListType" />
          </xs:choice>
        </xs:extension>
      </xs:complexContent>
    </xs:complexType>
  </xs:redefine>
 
 
  <!-- simple types -->
  <!-- File properties that can be selected as the text of a FileAttributes element. -->
  <xs:simpleType name="FileAttributesType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="Name"/>
      <xs:enumeration value="Path"/>
      <xs:enumeration value="Size"/>
      <xs:enumeration value="LastModifiedDate"/>
      <xs:enumeration value="Content"/>
      <xs:enumeration value="FileExtension"/>
    </xs:restriction>
  </xs:simpleType>
  <!-- Allowed values of the AuthorityFilter attribute of AccessList. -->
  <xs:simpleType name="AuthorityType">
    <xs:restriction base="xs:string">
      <xs:enumeration value="USERS"/>
      <xs:enumeration value="GROUPS"/>
    </xs:restriction>
  </xs:simpleType>
  <!-- Exactly three characters: position 1 is R or whitespace, position 2 is W or
       whitespace, position 3 is X or whitespace. -->
  <xs:simpleType name="MaskType">
    <xs:restriction base="xs:string">
      <xs:pattern value="(R|\s)(W|\s)(X|\s)" />
    </xs:restriction>
  </xs:simpleType>
 
 
  <!-- complex types -->
  <!-- AccessTree: empty element with the mandatory ExpandAccounts flag. -->
  <xs:complexType name="AccessTreeType">
    <xs:attribute name="ExpandAccounts" type="xs:boolean" use="required"/>
  </xs:complexType>
 
  <!-- AccessList: inherits ExpandAccounts from AccessTreeType and adds the mandatory
       Mask and the optional AuthorityFilter. -->
  <xs:complexType name="AccessListType">
    <xs:complexContent>
      <xs:extension base="AccessTreeType">
        <xs:attribute name="Mask" type="MaskType" use="required"/>
        <xs:attribute name="AuthorityFilter" type="AuthorityType" use="optional"/>
      </xs:extension>
    </xs:complexContent>
  </xs:complexType>
 
</xs:schema>

Attribute element

FileAttributes. The FileAttributes element selects which basic file property is crawled into the attribute. The following values can be configured:

  1. Name: the file name.
  2. Path: the file complete path.
  3. FileExtension: the file extension.
  4. Size: the file size.
  5. LastModifiedDate: the file last modification date.
  6. Content: the content of the file is emitted as byte[].
<!-- One Attribute per file property; the FileAttributes text selects the property
     and Name is the name given to the resulting attribute. -->
<Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
  <FileAttributes>LastModifiedDate</FileAttributes>
</Attribute>
<Attribute Type="String" Name="Filename">
  <FileAttributes>Name</FileAttributes>
</Attribute>
<Attribute Type="String" Name="Path" KeyAttribute="true">
  <FileAttributes>Path</FileAttributes>
</Attribute>
<!-- The file content is emitted as byte[]; this sample marks it with Attachment="true". -->
<Attribute Type="String" Name="Content" Attachment="true">
  <FileAttributes>Content</FileAttributes>
</Attribute>
<Attribute Type="String" Name="Extension">
  <FileAttributes>FileExtension</FileAttributes>
</Attribute>
<Attribute Type="String" Name="Size">
  <FileAttributes>Size</FileAttributes>
</Attribute>

Security information

AccessTree.

The AccessTree element is used to extract the raw access control list (ACL) information from a file. The security information is separated into access rights MObjects (read/write/execute mask, allow/deny type) and security account MObjects (SID, domain/computer, authentication name).

There is only one parameter to configure: the boolean ExpandAccounts. If it is set to true, security account groups are expanded, i.e. their sub-accounts are extracted as sub-MObjects too.

For example, assume a file that is accessible only by the Administrators group.

Configuration sample with ExpandAccounts="false"

<!-- Raw ACL tree; account groups are not expanded into their sub-accounts. -->
<Attribute Type="String" Name="AccessTreeNotExpanded">
  <AccessTree ExpandAccounts="false"/>
</Attribute>

Extracted attribute sample with ExpandAccounts="false"

<!-- Extracted attribute: the top-level O is the ACL object of the file. -->
<A n="AccessTreeNotExpanded">
  <O>
    <!-- ACL rule type: ALLOW or DENY -->
    <A n="type">
      <L>
        <V>ALLOW</V>
      </L>
    </A>
    <!-- ACL rule mask: R = Read, W = Write, X = eXecute -->
    <A n="mask">
      <L>
        <V>RWX</V>
      </L>
    </A>
    <!-- Security account the rule applies to -->
    <A n="account">
      <O>
        <!-- security identifier -->
        <A n="sid">
          <L>
            <V>S-1-5-32-544</V>
          </L>
        </A>
        <!-- account type -->
        <A n="type">
          <L>
            <V>ALIAS</V>
          </L>
        </A>
        <!-- account domain/computer name -->
        <A n="domain">
          <L>
            <V>BUILTIN</V>
          </L>
        </A>
        <!-- account name -->
        <A n="auth">
          <L>
            <V>Administrators</V>
          </L>
        </A>
      </O>
    </A>
  </O>
</A>

The top-level MObject corresponds to an ACL object.

It has three attributes:

type - ACL rule type, may be ALLOW or DENY

mask - ACL rule mask, R - Read, W - Write , X - eXecute

account - reference to security account MObject

Security account MObject attributes:

sid - security identifier.

type - account type

domain - account domain/computer name ( 1st level authentication's name )

auth - account name ( 2nd level authentication's name )


Configuration sample with ExpandAccounts="true"

<!-- Raw ACL tree; account groups are expanded, so their sub-accounts are extracted too. -->
<Attribute Type="String" Name="AccessTreeExpanded">
  <AccessTree ExpandAccounts="true"/>
</Attribute>

Extracted attribute sample with ExpandAccounts="true"

<!-- Extracted attribute: the same ACL object as in the non-expanded sample, but the
     group account additionally carries its sub-accounts. -->
<A n="AccessTreeExpanded">
  <O>
    <!-- ACL rule type: ALLOW or DENY -->
    <A n="type">
      <L>
        <V>ALLOW</V>
      </L>
    </A>
    <!-- ACL rule mask: R = Read, W = Write, X = eXecute -->
    <A n="mask">
      <L>
        <V>RWX</V>
      </L>
    </A>
    <!-- Security account the rule applies to (here a group) -->
    <A n="account">
      <O>
        <A n="sid">
          <L>
            <V>S-1-5-32-544</V>
          </L>
        </A>
        <A n="type">
          <L>
            <V>ALIAS</V>
          </L>
        </A>
        <A n="domain">
          <L>
            <V>BUILTIN</V>
          </L>
        </A>
        <A n="auth">
          <L>
            <V>Administrators</V>
          </L>
        </A>
        <!-- Present because ExpandAccounts is true: one O per sub-account of the group -->
        <A n="sub">
          <!-- first sub-account -->
          <O>
            <A n="sid">
              <L>
                <V>S-1-5-21-2105471877-1027867990-1527921536-500</V>
              </L>
            </A>
            <A n="type">
              <L>
                <V>USER</V>
              </L>
            </A>
            <A n="domain">
              <L>
                <V>Ivan</V>
              </L>
            </A>
            <A n="auth">
              <L>
                <V>Administrator</V>
              </L>
            </A>
          </O>
          <!-- second sub-account -->
          <O>
            <A n="sid">
              <L>
                <V>S-1-5-21-2105471877-1027867990-1527921536-1000</V>
              </L>
            </A>
            <A n="type">
              <L>
                <V>USER</V>
              </L>
            </A>
            <A n="domain">
              <L>
                <V>Ivan</V>
              </L>
            </A>
            <A n="auth">
              <L>
                <V>Ivanhoe</V>
              </L>
            </A>
          </O>
        </A>
      </O>
    </A>
  </O>
</A>

Information for two accounts was extracted for the group Administrators. The Administrators group account now has an additional attribute, sub, that groups its sub-accounts.


AccessList. The AccessList element is used to extract a ready-to-use, flat list of the accounts corresponding to the ACL. Additional parameters are used to filter for the required accounts.

ExpandAccounts - should we process sub-accounts or not.

AuthorityFilter - optional; GROUPS or USERS, to return only groups or only users.

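Mask - rights filter: a three-character string with one position each for Read, Write and eXecute (RWX). Put a whitespace character in the position of a right that is not part of the filter, e.g. Mask="  X" to filter on eXecute only.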

For example, we want to extract all accounts that are allowed to execute this file.

Sample Configuration for accounts directly linked to file ACL:

<!-- Flat account list filtered on the X (eXecute) position of the mask; groups are not expanded. -->
<Attribute Type="String" Name="allowed2eXecute">
  <AccessList ExpandAccounts="false" Mask="  X"/>
</Attribute>

Sample Result:

<!-- Result of the allowed2eXecute configuration: the record attribute carries the
     configured Name and lists only the account directly linked to the file ACL. -->
<A n="allowed2eXecute">
  <O>
    <!-- security identifier -->
    <A n="sid">
      <L>
        <V>S-1-5-32-544</V>
      </L>
    </A>
    <!-- account type -->
    <A n="type">
      <L>
        <V>ALIAS</V>
      </L>
    </A>
    <!-- account domain/computer name -->
    <A n="domain">
      <L>
        <V>BUILTIN</V>
      </L>
    </A>
    <!-- account name -->
    <A n="auth">
      <L>
        <V>Administrators</V>
      </L>
    </A>
  </O>
</A>

Sample Configuration for all accounts and sub-accounts:

<!-- Flat account list filtered on the X (eXecute) position of the mask; groups are
     expanded, so sub-accounts are processed as well. -->
<Attribute Type="String" Name="allowed2eXecute_ALL">
  <AccessList ExpandAccounts="true" Mask="  X"/>
</Attribute>

Sample Result:

<!-- Result of the allowed2eXecute_ALL configuration: the record attribute carries the
     configured Name and lists the group and its sub-accounts as one flat list. -->
<A n="allowed2eXecute_ALL">
  <!-- group account directly linked to the file ACL -->
  <O>
    <A n="sid">
      <L>
        <V>S-1-5-32-544</V>
      </L>
    </A>
    <A n="type">
      <L>
        <V>ALIAS</V>
      </L>
    </A>
    <A n="domain">
      <L>
        <V>BUILTIN</V>
      </L>
    </A>
    <A n="auth">
      <L>
        <V>Administrators</V>
      </L>
    </A>
  </O>
  <!-- first sub-account of the group, listed at the same level because the list is flat -->
  <O>
    <A n="sid">
      <L>
        <V>S-1-5-21-2105471877-1027867990-1527921536-500</V>
      </L>
    </A>
    <A n="type">
      <L>
        <V>USER</V>
      </L>
    </A>
    <A n="domain">
      <L>
        <V>Ivan</V>
      </L>
    </A>
    <A n="auth">
      <L>
        <V>Administrator</V>
      </L>
    </A>
  </O>
  <!-- second sub-account of the group -->
  <O>
    <A n="sid">
      <L>
        <V>S-1-5-21-2105471877-1027867990-1527921536-1000</V>
      </L>
    </A>
    <A n="type">
      <L>
        <V>USER</V>
      </L>
    </A>
    <A n="domain">
      <L>
        <V>Ivan</V>
      </L>
    </A>
    <A n="auth">
      <L>
        <V>Ivanhoe</V>
      </L>
    </A>
  </O>
</A>