|
|
Line 1: |
Line 1: |
− | == Filesystem Index Order == | + | == What does FileSystemCrawler do == |
− | Following is an example of a Filesystem Index Order:
| + | |
− | <source lang="xml">
| + | |
− | <IndexOrderConfiguration
| + | |
− | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
| + | |
− | xsi:noNamespaceSchemaLocation="../org.eclipse.smila.connectivity.framework.crawler.filesystem/schemas/filesystemIndexOrder.xsd"
| + | |
− | >
| + | |
− | <DataSourceID>file</DataSourceID>
| + | |
− | <SchemaID>org.eclipse.smila.connectivity.framework.crawler.filesystem</SchemaID>
| + | |
− | <DataConnectionID>
| + | |
− | <Crawler>FileSystemCrawlerDS</Crawler>
| + | |
− | </DataConnectionID>
| + | |
− | <CompoundHandling>Yes</CompoundHandling>
| + | |
− | <Attributes>
| + | |
− | <Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
| + | |
− | <FileAttributes>LastModifiedDate</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Filename">
| + | |
− | <FileAttributes>Name</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Path" KeyAttribute="true">
| + | |
− | <FileAttributes>Path</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Content" Attachment="true">
| + | |
− | <FileAttributes>Content</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Extension">
| + | |
− | <FileAttributes>FileExtension</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Size">
| + | |
− | <FileAttributes>Size</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="AccessTreeNotExpanded">
| + | |
− | <AccessTree ExpandAccounts="false"/>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="AccessTreeExpanded">
| + | |
− | <AccessTree ExpandAccounts="true"/>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="AccessListNotExpanded">
| + | |
− | <AccessList ExpandAccounts="false" Mask=" W "/>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="AccessListExpanded">
| + | |
− | <AccessList ExpandAccounts="true" Mask=" W "/>
| + | |
− | </Attribute>
| + | |
− | </Attributes>
| + | |
− | <Process>
| + | |
− | <BaseDir>c:\data</BaseDir>
| + | |
− | <Filter Recursive="true" CaseSensitive="false">
| + | |
− | <Include Name="*.txt"/>
| + | |
− | <Include Name="*.htm"/>
| + | |
− | <Include Name="*.html"/>
| + | |
− | <Include Name="*.xml"/>
| + | |
− | </Filter>
| + | |
− | </Process>
| + | |
− | </IndexOrderConfiguration>
| + | |
− | </source>
| + | |
| | | |
− | == XSD Schema used for Filesystem Crawler ==
| + | The FileSystemCrawler collects all files and folders recursively, starting from a given directory. In addition to the content of the files, it can gather any of the file meta information from the following list: |
− | <source lang="xml">
| + | |
− | <xs:schema elementFormDefault="qualified" attributeFormDefault="unqualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
| + | |
− | <xs:redefine schemaLocation="../../org.eclipse.smila.connectivity.framework.indexorder/schemas/RootIndexOrderConfiguration.xsd">
| + | |
− | <xs:complexType name="Process">
| + | |
− | <xs:annotation>
| + | |
− | <xs:documentation>Process Specification</xs:documentation>
| + | |
− | </xs:annotation>
| + | |
− | <xs:complexContent>
| + | |
− | <xs:extension base="Process">
| + | |
− | <xs:sequence maxOccurs="unbounded">
| + | |
− | <xs:element name="BaseDir" type="xs:string"/>
| + | |
− | <xs:element name="Filter">
| + | |
− | <xs:complexType>
| + | |
− | <xs:sequence>
| + | |
− | <xs:element name="Include" minOccurs="0" maxOccurs="unbounded">
| + | |
− | <xs:complexType>
| + | |
− | <xs:attribute name="Name" type="xs:string" use="required"/>
| + | |
− | <xs:attribute name="DateFrom" type="xs:dateTime" use="optional"/>
| + | |
− | <xs:attribute name="DateTo" type="xs:dateTime" use="optional"/>
| + | |
− | </xs:complexType>
| + | |
− | </xs:element>
| + | |
− | <xs:element name="Exclude" minOccurs="0" maxOccurs="unbounded">
| + | |
− | <xs:complexType>
| + | |
− | <xs:attribute name="Name" type="xs:string" use="required"/>
| + | |
− | </xs:complexType>
| + | |
− | </xs:element>
| + | |
− | </xs:sequence>
| + | |
− | <xs:attribute name="CaseSensitive" type="xs:boolean" use="optional" default="false"/>
| + | |
− | <xs:attribute name="Recursive" type="xs:boolean" use="optional" default="true"/>
| + | |
− | </xs:complexType>
| + | |
− | </xs:element>
| + | |
− | </xs:sequence>
| + | |
− | </xs:extension>
| + | |
− | </xs:complexContent>
| + | |
− | </xs:complexType>
| + | |
− | <xs:complexType name="Attribute">
| + | |
− | <xs:complexContent>
| + | |
− | <xs:extension base="Attribute">
| + | |
− | <xs:choice>
| + | |
− | <xs:element name="FileAttributes" type="FileAttributesType" />
| + | |
− | <xs:element name="AccessTree" type="AccessTreeType" />
| + | |
− | <xs:element name="AccessList" type="AccessListType" />
| + | |
− | </xs:choice>
| + | |
− | </xs:extension>
| + | |
− | </xs:complexContent>
| + | |
− | </xs:complexType>
| + | |
− | </xs:redefine>
| + | |
− |
| + | |
− |
| + | |
− | <!-- simple types -->
| + | |
− | <xs:simpleType name="FileAttributesType">
| + | |
− | <xs:restriction base="xs:string">
| + | |
− | <xs:enumeration value="Name"/>
| + | |
− | <xs:enumeration value="Path"/>
| + | |
− | <xs:enumeration value="Size"/>
| + | |
− | <xs:enumeration value="LastModifiedDate"/>
| + | |
− | <xs:enumeration value="Content"/>
| + | |
− | <xs:enumeration value="FileExtension"/>
| + | |
− | </xs:restriction>
| + | |
− | </xs:simpleType>
| + | |
− | <xs:simpleType name="AuthorityType">
| + | |
− | <xs:restriction base="xs:string">
| + | |
− | <xs:enumeration value="USERS"/>
| + | |
− | <xs:enumeration value="GROUPS"/>
| + | |
− | </xs:restriction>
| + | |
− | </xs:simpleType>
| + | |
− | <xs:simpleType name="MaskType">
| + | |
− | <xs:restriction base="xs:string">
| + | |
− | <xs:pattern value="(R|\s)(W|\s)(X|\s)" />
| + | |
− | </xs:restriction>
| + | |
− | </xs:simpleType>
| + | |
| | | |
| + | * size |
| + | * full path |
| + | * file name only |
| + | * file size |
| + | * last modified date |
| + | * file content |
| + | * file extension |
| | | |
− | <!-- complex types -->
| + | == Crawling configuration == |
− | <xs:complexType name="AccessTreeType">
| + | |
− | <xs:attribute name="ExpandAccounts" type="xs:boolean" use="required"/>
| + | |
− | </xs:complexType>
| + | |
− |
| + | |
− | <xs:complexType name="AccessListType">
| + | |
− | <xs:complexContent>
| + | |
− | <xs:extension base="AccessTreeType">
| + | |
− | <xs:attribute name="Mask" type="MaskType" use="required"/>
| + | |
− | <xs:attribute name="AuthorityFilter" type="AuthorityType" use="optional"/>
| + | |
− | </xs:extension>
| + | |
− | </xs:complexContent>
| + | |
− | </xs:complexType>
| + | |
| | | |
− | </xs:schema>
| + | The configuration file can be found at <tt>configuration/org.eclipse.smila.framework/file</tt>. |
− | </source>
| + | |
− | | + | |
− | == Attribute element ==
| + | |
− | '''FileAttributes.'''
| + | |
− | The FileAttributes element describes the simple file information that should be crawled. The following options can be configured: | + |
− | # Name: the file name.
| + | |
− | # Path: the file complete path.
| + | |
− | # FileExtension: the file extension.
| + | |
− | # Size: the file size.
| + | |
− | # LastModifiedDate: the file last modification date.
| + | |
− | # Content: the content of the file is emitted as byte[].
| + | |
− | <source lang="xml"> | + | |
− | <Attribute Type="Date" Name="LastModifiedDate" HashAttribute="true">
| + | |
− | <FileAttributes>LastModifiedDate</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Filename">
| + | |
− | <FileAttributes>Name</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Path" KeyAttribute="true">
| + | |
− | <FileAttributes>Path</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Content" Attachment="true">
| + | |
− | <FileAttributes>Content</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Extension">
| + | |
− | <FileAttributes>FileExtension</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | <Attribute Type="String" Name="Size">
| + | |
− | <FileAttributes>Size</FileAttributes>
| + | |
− | </Attribute>
| + | |
− | </source>
| + | |
− | | + | |
− | '''Security information'''
| + | |
− | | + | |
− | '''AccessTree.'''
| + | |
− | | + | |
− | The AccessTree element is used to extract raw access control list (ACL) information from a file. Security information is separated into access rights information MObjects (read/write/execute, allow/deny type) and security account information MObjects (SID, domain/computer, authentication name).
| + | |
− | | + | |
− | There is only one boolean parameter to configure: '''ExpandAccounts'''. If it is set to true, security account groups are expanded - sub-accounts are extracted too, as sub-MObjects.
| + | |
− | | + | |
− | For example, file is accessible only by Administrators group.
| + | |
− | | + | |
− | Configuration sample with ExpandAccounts="false"
| + | |
− | <source lang="xml">
| + | |
− | <Attribute Type="String" Name="AccessTreeNotExpanded">
| + | |
− | <AccessTree ExpandAccounts="false"/>
| + | |
− | </Attribute>
| + | |
− | </source>
| + | |
− | | + | |
− | Extracted attribute sample with ExpandAccounts="false"
| + | |
− | <source lang="xml">
| + | |
− | <A n="AccessTreeNotExpanded">
| + | |
− | <O>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>ALLOW</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="mask">
| + | |
− | <L>
| + | |
− | <V>RWX</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="account">
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-32-544</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>ALIAS</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>BUILTIN</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Administrators</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </source>
| + | |
− | | + | |
− | Top MObject corresponds to ACL object.
| + | |
− | | + | |
− | There are three attributes:
| + | |
− | '''type''' - ACL rule type, may be ALLOW or DENY
| + | |
− | | + | |
− | '''mask''' - ACL rule mask, R - Read, W - Write , X - eXecute
| + | |
− | | + | |
− | '''account''' - reference to security account MObject
| + | |
− | | + | |
− | Security account MObject attributes:
| + | |
− | | + | |
− | '''sid''' - security identifier.
| + | |
− | | + | |
− | '''type''' - account type
| + | |
− | | + | |
− | '''domain''' - account domain/computer name ( 1st level authentication's name )
| + | |
− | | + | |
− | '''auth''' - account name ( 2nd level authentication's name )
| + | |
− | | + | |
− | | + | |
− | Configuration sample with ExpandAccounts="true"
| + | |
− | <source lang="xml">
| + | |
− | <Attribute Type="String" Name="AccessTreeExpanded">
| + | |
− | <AccessTree ExpandAccounts="true"/>
| + | |
− | </Attribute>
| + | |
− | </source>
| + | |
− | | + | |
− | Extracted attribute sample with ExpandAccounts="true"
| + | |
− | <source lang="xml">
| + | |
− | <A n="AccessTreeExpanded">
| + | |
− | <O>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>ALLOW</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="mask">
| + | |
− | <L>
| + | |
− | <V>RWX</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="account">
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-32-544</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>ALIAS</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>BUILTIN</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Administrators</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="sub">
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-21-2105471877-1027867990-1527921536-500</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>USER</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>Ivan</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Administrator</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-21-2105471877-1027867990-1527921536-1000</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>USER</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>Ivan</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Ivanhoe</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </source>
| + | |
− | Two accounts info extracted for group Administrators.
| + | |
− | The group '''Administrators''' account now has an additional attribute
| + | |
− | '''sub''' to group sub-accounts
| + | |
− | | + | |
− | | + | |
− | '''AccessList.'''
| + | |
− | AccessList attribute configuration. This attribute is used to extract a ready-made, flat list of accounts corresponding to the ACL. Additional parameters are used to filter the required accounts.
| + | |
− | | + | |
− | '''ExpandAccounts''' - should we process sub-accounts or not.
| + | |
− | | + | |
− | '''AuthorityFilter''' - GROUPS/USERS return only groups or users - optional.
| + | |
− | | + | |
− | '''Mask''' - rights filter: Read, Write, eXecute (RWX)
| + | |
− | | + | |
− | For example, we want to extract all accounts that are allowed to execute this file.
| + | |
− | | + | |
− | Sample Configuration for accounts directly linked to file ACL:
| + | |
− | <source lang="xml">
| + | |
− | <Attribute Type="String" Name="allowed2eXecute">
| + | |
− | <AccessList ExpandAccounts="false" Mask=" X"/>
| + | |
− | </Attribute>
| + | |
− | </source>
| + | |
− | Sample Result:
| + | |
− | <source lang="xml">
| + | |
− | <A n="allowed2eXecute">
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-32-544</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>ALIAS</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>BUILTIN</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Administrators</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </source>
| + | |
− | | + | |
− | Sample Configuration for all accounts and sub/accounts
| + | |
− | <source lang="xml">
| + | |
− | <Attribute Type="String" Name="allowed2eXecute_ALL">
| + | |
− | <AccessList ExpandAccounts="true" Mask=" X"/>
| + | |
− | </Attribute>
| + | |
− | </source>
| + | |
− | Sample Result:
| + | |
− | <source lang="xml">
| + | |
− | <A n="allowed2eXecute_ALL">
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-32-544</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>ALIAS</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>BUILTIN</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Administrators</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-21-2105471877-1027867990-1527921536-500</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>USER</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>Ivan</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Administrator</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | <O>
| + | |
− | <A n="sid">
| + | |
− | <L>
| + | |
− | <V>S-1-5-21-2105471877-1027867990-1527921536-1000</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="type">
| + | |
− | <L>
| + | |
− | <V>USER</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="domain">
| + | |
− | <L>
| + | |
− | <V>Ivan</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | <A n="auth">
| + | |
− | <L>
| + | |
− | <V>Ivanhoe</V>
| + | |
− | </L>
| + | |
− | </A>
| + | |
− | </O>
| + | |
− | </A>
| + | |
− | </source>
| + | |
The FileSystemCrawler collects all files and folders recursively, starting from a given directory. In addition to the content of the files, it can gather any of the file meta information from the following list: