Jump to: navigation, search

Difference between revisions of "SMILA/Documentation/Filesystem Crawler"

m (Replacing page with '== What does FileSystemCrawler do == The FileSystemCrawler collects all files and folders recursively starting from a given directory. Next do the content of files it may gath...')
Line 1: Line 1:
== Filesystem Index Order ==
+
== What does FileSystemCrawler do ==
Following is an example of a Filesystem Index Order:
+
<source lang="xml">
+
<IndexOrderConfiguration
+
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+
  xsi:noNamespaceSchemaLocation="../org.eclipse.smila.connectivity.framework.crawler.filesystem/schemas/filesystemIndexOrder.xsd"
+
>
+
  <DataSourceID>file</DataSourceID>
+
  <SchemaID>org.eclipse.smila.connectivity.framework.crawler.filesystem</SchemaID>
+
  <DataConnectionID>
+
    <Crawler>FileSystemCrawlerDS</Crawler>
+
  </DataConnectionID>
+
  <CompoundHandling>Yes</CompoundHandling>
+
  <Attributes>
+
    <Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
+
      <FileAttributes>LastModifiedDate</FileAttributes>
+
    </Attribute>
+
    <Attribute Type="String" Name="Filename">
+
      <FileAttributes>Name</FileAttributes>
+
    </Attribute>
+
    <Attribute Type="String" Name="Path" KeyAttribute="true">
+
      <FileAttributes>Path</FileAttributes>
+
    </Attribute>
+
    <Attribute Type="String" Name="Content" Attachment="true">
+
      <FileAttributes>Content</FileAttributes>
+
    </Attribute>
+
    <Attribute Type="String" Name="Extension">
+
      <FileAttributes>FileExtension</FileAttributes>
+
    </Attribute>
+
    <Attribute Type="String" Name="Size">
+
      <FileAttributes>Size</FileAttributes>
+
    </Attribute>   
+
    <Attribute Type="String" Name="AccessTreeNotExpanded">
+
      <AccessTree ExpandAccounts="false"/>
+
    </Attribute>
+
    <Attribute Type="String" Name="AccessTreeExpanded">
+
      <AccessTree ExpandAccounts="true"/>
+
    </Attribute>
+
    <Attribute Type="String" Name="AccessListNotExpanded">
+
      <AccessList ExpandAccounts="false" Mask=" W "/>
+
    </Attribute>
+
    <Attribute Type="String" Name="AccessListExpanded">
+
      <AccessList ExpandAccounts="true" Mask=" W "/>
+
    </Attribute>
+
  </Attributes>
+
  <Process>
+
    <BaseDir>c:\data</BaseDir>
+
    <Filter Recursive="true" CaseSensitive="false">
+
      <Include Name="*.txt"/>
+
      <Include Name="*.htm"/>
+
      <Include Name="*.html"/>
+
      <Include Name="*.xml"/>     
+
    </Filter>
+
  </Process>
+
</IndexOrderConfiguration>
+
</source>
+
  
== XSD Schema used for Filesystem Crawler ==
+
The FileSystemCrawler collects all files and folders recursively, starting from a given directory. Next to the content of the files, it can gather any of the file meta information from the following list:
<source lang="xml">
+
<xs:schema elementFormDefault="qualified" attributeFormDefault="unqualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
+
  <xs:redefine schemaLocation="../../org.eclipse.smila.connectivity.framework.indexorder/schemas/RootIndexOrderConfiguration.xsd">
+
    <xs:complexType name="Process">
+
      <xs:annotation>
+
        <xs:documentation>Process Specification</xs:documentation>
+
      </xs:annotation>
+
      <xs:complexContent>
+
        <xs:extension base="Process">
+
          <xs:sequence maxOccurs="unbounded">
+
            <xs:element name="BaseDir" type="xs:string"/>
+
            <xs:element name="Filter">
+
              <xs:complexType>
+
                <xs:sequence>
+
                  <xs:element name="Include" minOccurs="0" maxOccurs="unbounded">
+
                    <xs:complexType>
+
                      <xs:attribute name="Name" type="xs:string" use="required"/>
+
                      <xs:attribute name="DateFrom" type="xs:dateTime" use="optional"/>
+
                      <xs:attribute name="DateTo" type="xs:dateTime" use="optional"/>
+
                    </xs:complexType>
+
                  </xs:element>
+
                  <xs:element name="Exclude" minOccurs="0" maxOccurs="unbounded">
+
                    <xs:complexType>
+
                      <xs:attribute name="Name" type="xs:string" use="required"/>
+
                    </xs:complexType>
+
                  </xs:element>
+
                </xs:sequence>
+
                <xs:attribute name="CaseSensitive" type="xs:boolean" use="optional" default="false"/>
+
                <xs:attribute name="Recursive" type="xs:boolean" use="optional" default="true"/>
+
              </xs:complexType>
+
            </xs:element>
+
          </xs:sequence>
+
        </xs:extension>
+
      </xs:complexContent>
+
    </xs:complexType>
+
    <xs:complexType name="Attribute">
+
      <xs:complexContent>
+
        <xs:extension base="Attribute">
+
          <xs:choice>
+
            <xs:element name="FileAttributes" type="FileAttributesType" />
+
            <xs:element name="AccessTree" type="AccessTreeType" />
+
            <xs:element name="AccessList" type="AccessListType" />
+
          </xs:choice>
+
        </xs:extension>
+
      </xs:complexContent>
+
    </xs:complexType>
+
  </xs:redefine>
+
 
+
 
+
  <!-- simple types -->
+
  <xs:simpleType name="FileAttributesType">
+
    <xs:restriction base="xs:string">
+
      <xs:enumeration value="Name"/>
+
      <xs:enumeration value="Path"/>
+
      <xs:enumeration value="Size"/>
+
      <xs:enumeration value="LastModifiedDate"/>
+
      <xs:enumeration value="Content"/>
+
      <xs:enumeration value="FileExtension"/>
+
    </xs:restriction>
+
  </xs:simpleType>
+
  <xs:simpleType name="AuthorityType">
+
    <xs:restriction base="xs:string">
+
      <xs:enumeration value="USERS"/>
+
      <xs:enumeration value="GROUPS"/>
+
    </xs:restriction>
+
  </xs:simpleType>
+
  <xs:simpleType name="MaskType">
+
    <xs:restriction base="xs:string">
+
      <xs:pattern value="(R|\s)(W|\s)(X|\s)" />
+
    </xs:restriction>
+
  </xs:simpleType>
+
  
 +
* size
 +
* full path
 +
* file name only
 +
* file size
 +
* last modified date
 +
* file content
 +
* file extension
  
  <!-- complex types -->
+
== Crawling configuration ==
  <xs:complexType name="AccessTreeType">
+
    <xs:attribute name="ExpandAccounts" type="xs:boolean" use="required"/>
+
  </xs:complexType>
+
 
+
  <xs:complexType name="AccessListType">
+
    <xs:complexContent>
+
      <xs:extension base="AccessTreeType">
+
        <xs:attribute name="Mask" type="MaskType" use="required"/>
+
        <xs:attribute name="AuthorityFilter" type="AuthorityType" use="optional"/>
+
      </xs:extension>
+
    </xs:complexContent>
+
  </xs:complexType>
+
  
</xs:schema>
+
The configuration file can be found at <tt>configuration/org.eclipse.smila.framework/file</tt>.
</source>
+
 
+
== Attribute element ==
+
'''FileAttributes.'''
+
The FileAttributes element specifies which basic file information should be crawled. The following options can be configured:
+
# Name: the file name.
+
# Path: the file complete path.
+
# FileExtension: the file extension.
+
# Size: the file size.
+
# LastModifiedDate: the file last modification date.
+
# Content: the content of the file is emitted as byte[].
+
<source lang="xml">
+
<Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
+
  <FileAttributes>LastModifiedDate</FileAttributes>
+
</Attribute>
+
<Attribute Type="String" Name="Filename">
+
  <FileAttributes>Name</FileAttributes>
+
</Attribute>
+
<Attribute Type="String" Name="Path" KeyAttribute="true">
+
  <FileAttributes>Path</FileAttributes>
+
</Attribute>
+
<Attribute Type="String" Name="Content" Attachment="true">
+
  <FileAttributes>Content</FileAttributes>
+
</Attribute>
+
<Attribute Type="String" Name="Extension">
+
  <FileAttributes>FileExtension</FileAttributes>
+
</Attribute>
+
<Attribute Type="String" Name="Size">
+
  <FileAttributes>Size</FileAttributes>
+
</Attribute>   
+
</source>
+
 
+
'''Security information'''
+
 
+
'''AccessTree.'''
+
 
+
The AccessTree element is used to extract raw access control list (ACL) information from a file. The security information is separated into access rights MObjects (read/write/execute, allow/deny type) and security account MObjects (SID, domain/computer, authentication name).
+
 
+
There is only one boolean parameter to configure: '''ExpandAccounts'''. If it is set to true, security account groups are expanded, i.e. their sub-accounts are extracted as sub-MObjects as well.
+
 
+
For example, assume a file is accessible only by the Administrators group.
+
 
+
Configuration sample with ExpandAccounts="false"
+
<source lang="xml">
+
<Attribute Type="String" Name="AccessTreeNotExpanded">
+
  <AccessTree ExpandAccounts="false"/>
+
</Attribute>
+
</source>
+
 
+
Extracted attribute sample with ExpandAccounts="false"
+
<source lang="xml">
+
<A n="AccessTreeNotExpanded">
+
  <O>
+
    <A n="type">
+
      <L>
+
        <V>ALLOW</V>
+
      </L>
+
    </A>
+
    <A n="mask">
+
      <L>
+
        <V>RWX</V>
+
      </L>
+
    </A>
+
    <A n="account">
+
      <O>
+
        <A n="sid">
+
          <L>
+
            <V>S-1-5-32-544</V>
+
          </L>
+
        </A>
+
        <A n="type">
+
          <L>
+
            <V>ALIAS</V>
+
          </L>
+
        </A>
+
        <A n="domain">
+
          <L>
+
            <V>BUILTIN</V>
+
          </L>
+
        </A>
+
        <A n="auth">
+
          <L>
+
            <V>Administrators</V>
+
          </L>
+
        </A>
+
      </O>
+
    </A>
+
  </O>
+
</A>
+
</source>
+
 
+
The top-level MObject corresponds to the ACL object.
+
 
+
There are three attributes:
+
'''type''' - ACL rule type, may be ALLOW or DENY
+
 
+
'''mask''' - ACL rule mask, R - Read, W - Write , X - eXecute
+
 
+
'''account''' - reference to security account MObject
+
 
+
Security account MObject attributes:
+
 
+
'''sid'''  - security identifier.
+
 
+
'''type''' - account type
+
 
+
'''domain''' - account domain/computer name ( 1st level authentication's name )
+
 
+
'''auth''' - account name ( 2nd level authentication's name )
+
 
+
 
+
Configuration sample with ExpandAccounts="true"
+
<source lang="xml">
+
<Attribute Type="String" Name="AccessTreeExpanded">
+
  <AccessTree ExpandAccounts="true"/>
+
</Attribute>
+
</source>
+
 
+
Extracted attribute sample with ExpandAccounts="true"
+
<source lang="xml">
+
<A n="AccessTreeExpanded">
+
  <O>
+
    <A n="type">
+
      <L>
+
        <V>ALLOW</V>
+
      </L>
+
    </A>
+
    <A n="mask">
+
      <L>
+
        <V>RWX</V>
+
      </L>
+
    </A>
+
    <A n="account">
+
      <O>
+
        <A n="sid">
+
          <L>
+
            <V>S-1-5-32-544</V>
+
          </L>
+
        </A>
+
        <A n="type">
+
          <L>
+
            <V>ALIAS</V>
+
          </L>
+
        </A>
+
        <A n="domain">
+
          <L>
+
            <V>BUILTIN</V>
+
          </L>
+
        </A>
+
        <A n="auth">
+
          <L>
+
            <V>Administrators</V>
+
          </L>
+
        </A>
+
        <A n="sub">
+
          <O>
+
            <A n="sid">
+
              <L>
+
                <V>S-1-5-21-2105471877-1027867990-1527921536-500</V>
+
              </L>
+
            </A>
+
            <A n="type">
+
              <L>
+
                <V>USER</V>
+
              </L>
+
            </A>
+
            <A n="domain">
+
              <L>
+
                <V>Ivan</V>
+
              </L>
+
            </A>
+
            <A n="auth">
+
              <L>
+
                <V>Administrator</V>
+
              </L>
+
            </A>
+
          </O>
+
          <O>
+
            <A n="sid">
+
              <L>
+
                <V>S-1-5-21-2105471877-1027867990-1527921536-1000</V>
+
              </L>
+
            </A>
+
            <A n="type">
+
              <L>
+
                <V>USER</V>
+
              </L>
+
            </A>
+
            <A n="domain">
+
              <L>
+
                <V>Ivan</V>
+
              </L>
+
            </A>
+
            <A n="auth">
+
              <L>
+
                <V>Ivanhoe</V>
+
              </L>
+
            </A>
+
          </O>
+
        </A>
+
      </O>
+
    </A>
+
  </O>
+
</A>
+
</source>
+
Information for two accounts is extracted for the group Administrators.
+
The group '''Administrators''' account now has an additional attribute
+
'''sub''' to group sub-accounts
+
 
+
 
+
'''AccessList.'''
+
AccessList attribute configuration. This attribute is used to extract a ready-to-use, flat list of the accounts corresponding to the ACL. Additional parameters are used to filter the required accounts.
+
 
+
'''ExpandAccounts''' - should we process sub-accounts or not.
+
 
+
'''AuthorityFilter''' - optional; GROUPS or USERS, returns only groups or only users.
+
 
+
'''Mask''' - rights filter: Read, Write, eXecute (RWX)
+
 
+
For example, we want to extract all accounts that are allowed to execute this file.
+
 
+
Sample Configuration for accounts directly linked to file ACL:
+
<source lang="xml">
+
<Attribute Type="String" Name="allowed2eXecute">
+
  <AccessList ExpandAccounts="false" Mask="  X"/>
+
</Attribute>
+
</source>
+
Sample Result:
+
<source lang="xml">
+
<A n="allowed2eXecute">
+
  <O>
+
    <A n="sid">
+
      <L>
+
        <V>S-1-5-32-544</V>
+
      </L>
+
    </A>
+
    <A n="type">
+
      <L>
+
        <V>ALIAS</V>
+
      </L>
+
    </A>
+
    <A n="domain">
+
      <L>
+
        <V>BUILTIN</V>
+
      </L>
+
    </A>
+
    <A n="auth">
+
      <L>
+
        <V>Administrators</V>
+
      </L>
+
    </A>
+
  </O>
+
</A>
+
</source>
+
 
+
Sample Configuration for all accounts and sub-accounts
+
<source lang="xml">
+
<Attribute Type="String" Name="allowed2eXecute_ALL">
+
  <AccessList ExpandAccounts="true" Mask="  X"/>
+
</Attribute>
+
</source>
+
Sample Result:
+
<source lang="xml">
+
<A n="allowed2eXecute_ALL">
+
  <O>
+
    <A n="sid">
+
      <L>
+
        <V>S-1-5-32-544</V>
+
      </L>
+
    </A>
+
    <A n="type">
+
      <L>
+
        <V>ALIAS</V>
+
      </L>
+
    </A>
+
    <A n="domain">
+
      <L>
+
        <V>BUILTIN</V>
+
      </L>
+
    </A>
+
    <A n="auth">
+
      <L>
+
        <V>Administrators</V>
+
      </L>
+
    </A>
+
  </O>
+
  <O>
+
    <A n="sid">
+
      <L>
+
        <V>S-1-5-21-2105471877-1027867990-1527921536-500</V>
+
      </L>
+
    </A>
+
    <A n="type">
+
      <L>
+
        <V>USER</V>
+
      </L>
+
    </A>
+
    <A n="domain">
+
      <L>
+
        <V>Ivan</V>
+
      </L>
+
    </A>
+
    <A n="auth">
+
      <L>
+
        <V>Administrator</V>
+
      </L>
+
    </A>
+
  </O>
+
  <O>
+
    <A n="sid">
+
      <L>
+
        <V>S-1-5-21-2105471877-1027867990-1527921536-1000</V>
+
      </L>
+
    </A>
+
    <A n="type">
+
      <L>
+
        <V>USER</V>
+
      </L>
+
    </A>
+
    <A n="domain">
+
      <L>
+
        <V>Ivan</V>
+
      </L>
+
    </A>
+
    <A n="auth">
+
      <L>
+
        <V>Ivanhoe</V>
+
      </L>
+
    </A>
+
  </O>
+
</A>
+
</source>
+

Revision as of 02:19, 19 March 2009

What does FileSystemCrawler do

The FileSystemCrawler collects all files and folders recursively, starting from a given directory. Next to the content of the files, it can gather any of the file meta information from the following list:

  • size
  • full path
  • file name only
  • file size
  • last modified date
  • file content
  • file extension

Crawling configuration

The configuration file can be found at configuration/org.eclipse.smila.framework/file.