RapidMiner: filtering out n-grams with n>3

I have about 2 million messages (data tables).
I'd like to filter out messages containing a frequent X-gram, where X>3
(frequency measured as a percentage of all messages).
For example:
Message 1 = "1 2 3 4 5"
Message 2 = "1 2 3 4 6"
Message 3 = "1 2 3"
M1 and M2 both contain the 4-gram 1_2_3_4, so I want to exclude them; the result should keep only M3.

You can use the Text Processing extension to find the n-grams, count how many contain more than three terms, and add that number to the example set to allow subsequent filtering. You can also retain the original data.
Here's an example that you could copy (note that you have to install the Text Processing extension from the RapidMiner Marketplace).
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="6.5.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.5.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<parameter key="parallelize_main_process" value="false"/>
<process expanded="true">
<operator activated="true" class="generate_data_user_specification" compatibility="6.5.000" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="75">
<list key="attribute_values">
<parameter key="message" value=""1 2 3 4""/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="6.5.000" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="165">
<list key="attribute_values">
<parameter key="message" value=""1 2 3 4 5""/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="6.5.000" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="255">
<list key="attribute_values">
<parameter key="message" value=""1 2 3""/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="append" compatibility="6.5.000" expanded="true" height="112" name="Append" width="90" x="246" y="75">
<parameter key="datamanagement" value="double_array"/>
<parameter key="merge_type" value="all"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="6.5.000" expanded="true" height="76" name="Nominal to Text" width="90" x="380" y="75">
<parameter key="attribute_filter_type" value="all"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="nominal"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="file_path"/>
<parameter key="block_type" value="single_value"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="single_value"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="6.5.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="514" y="75">
<parameter key="create_word_vector" value="false"/>
<parameter key="vector_creation" value="Term Occurrences"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="none"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="select_attributes_and_weights" value="false"/>
<list key="specify_weights"/>
<parameter key="parallelize_vector_creation" value="false"/>
<process expanded="true">
<operator activated="true" class="multiply" compatibility="6.5.000" expanded="true" height="94" name="Multiply" width="90" x="44" y="30"/>
<operator activated="true" class="text:tokenize" compatibility="6.5.000" expanded="true" height="60" name="Tokenize" width="90" x="179" y="30">
<parameter key="mode" value="regular expression"/>
<parameter key="characters" value=".:"/>
<parameter key="expression" value="\s"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="6.5.000" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="179" y="120">
<parameter key="max_length" value="4"/>
</operator>
<operator activated="true" class="text:filter_tokens_by_content" compatibility="6.5.000" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="179" y="210">
<parameter key="condition" value="contains match"/>
<parameter key="regular_expression" value="(_.){3,}"/>
<parameter key="case_sensitive" value="false"/>
<parameter key="invert condition" value="false"/>
</operator>
<operator activated="true" class="text:extract_token_number" compatibility="6.5.000" expanded="true" height="60" name="Extract Token Number" width="90" x="179" y="300">
<parameter key="metadata_key" value="numberOfNGramsGT3"/>
<parameter key="condition" value="all"/>
<parameter key="case_sensitive" value="false"/>
<parameter key="invert_condition" value="false"/>
</operator>
<connect from_port="document" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Tokenize" to_port="document"/>
<connect from_op="Multiply" from_port="output 2" to_port="document 1"/>
<connect from_op="Tokenize" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
<connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Extract Token Number" to_port="document"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="filter_examples" compatibility="6.5.000" expanded="true" height="94" name="Filter Examples" width="90" x="782" y="75">
<parameter key="parameter_expression" value=""/>
<parameter key="condition_class" value="custom_filters"/>
<parameter key="invert_filter" value="false"/>
<list key="filters_list">
<parameter key="filters_entry_key" value="numberOfNGramsGT3.eq.0"/>
</list>
<parameter key="filters_logic_and" value="true"/>
<parameter key="filters_check_metadata" value="true"/>
</operator>
<connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
<connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
<connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
<connect from_op="Append" from_port="merged set" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
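If you want to sanity-check the same logic outside RapidMiner, here is a minimal Python sketch of what the process above does, plus the frequency threshold the question asks for: build the n-grams with n > 3 for each message (joined with "_", as Generate n-Grams does), find those that occur in more than a chosen share of messages, and drop every message that contains one. The min_share threshold is my own placeholder; the regex (_.){3,} in the process plays the role of the n >= 4 test here.

from collections import Counter

messages = ["1 2 3 4 5", "1 2 3 4 6", "1 2 3"]
min_share = 0.5  # hypothetical: an n-gram is "frequent" if it occurs in > 50% of messages

def ngrams_over_3(text, max_n=5):
    # all n-grams with n >= 4, joined with "_" like RapidMiner's Generate n-Grams (Terms)
    tokens = text.split()
    return {"_".join(tokens[i:i + n])
            for n in range(4, max_n + 1)
            for i in range(len(tokens) - n + 1)}

per_message = [ngrams_over_3(m) for m in messages]

# document frequency of each long n-gram = share of messages that contain it
doc_freq = Counter(g for grams in per_message for g in grams)
frequent = {g for g, c in doc_freq.items() if c / len(messages) > min_share}

kept = [m for m, grams in zip(messages, per_message) if not grams & frequent]
print(kept)  # ['1 2 3'] -- '1_2_3_4' appears in 2 of 3 messages, so M1 and M2 are dropped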

Related

RapidMiner: count total occurrences and sort by date

I have a RapidMiner example set like this:
ID Issue Exp
100 9/8/2020 11/8/2020
100 8/5/2019 9/5/2019
101 6/3/2020 10/1/2020
102 8/15/2020 12/12/2020
I want to add a new column that counts the occurrences of each ID cumulatively, sorted by the earliest date, so that we know how many occurrences there were as of each date.
The output should look like this:
ID Issue Exp Count
100 8/5/2019 9/5/2019 1
100 9/8/2020 11/8/2020 2
101 6/3/2020 10/1/2020 1
102 8/15/2020 12/12/2020 1
But when I aggregate by ID and do a count, it just counts the total and shows it on every row with the same ID. So for ID 100 it shows me 2 both times, because it simply adds up all the occurrences.
For example, for ID 100 in 2019 we had only 1 issue date, hence the count is 1; when we find ID 100 again in 2020 the count becomes 2. The sort by date is also important because it puts the ID occurrences in the correct order.
Any help is appreciated.
Thanks.
One approach is to use the Loop Values operator to loop through all the possible values of the ID attribute, use each value to filter the example set (which has already been sorted), generate a new incrementing id within this filtered set, and finally append all the filtered subsets back together.
Here's the process and corresponding XML to do this.
<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.9.000" expanded="true" height="68" name="Retrieve occById" width="90" x="45" y="34">
<parameter key="repository_entry" value="//Local Repository/data/occById"/>
</operator>
<operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="179" y="34">
<list key="sort_by">
<parameter key="ID" value="ascending"/>
<parameter key="Issue" value="ascending"/>
</list>
</operator>
<operator activated="true" class="concurrency:loop_values" compatibility="9.9.000" expanded="true" height="82" name="Loop Values" width="90" x="313" y="34">
<parameter key="attribute" value="ID"/>
<parameter key="iteration_macro" value="loop_value"/>
<parameter key="reuse_results" value="false"/>
<parameter key="enable_parallel_execution" value="true"/>
<process expanded="true">
<operator activated="true" class="filter_examples" compatibility="9.9.000" expanded="true" height="103" name="Filter Examples" width="90" x="112" y="34">
<parameter key="parameter_string" value="ID=%{loop_value}"/>
<parameter key="parameter_expression" value=""/>
<parameter key="condition_class" value="attribute_value_filter"/>
<parameter key="invert_filter" value="false"/>
<list key="filters_list">
<parameter key="filters_entry_key" value="ID.eq.%{loop_value}"/>
</list>
<parameter key="filters_logic_and" value="true"/>
<parameter key="filters_check_metadata" value="true"/>
</operator>
<operator activated="true" class="generate_id" compatibility="9.9.000" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34">
<parameter key="create_nominal_ids" value="false"/>
<parameter key="offset" value="0"/>
</operator>
<connect from_port="input 1" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="append" compatibility="9.9.000" expanded="true" height="82" name="Append" width="90" x="447" y="34">
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="merge_type" value="all"/>
</operator>
<connect from_op="Retrieve occById" from_port="output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Loop Values" to_port="input 1"/>
<connect from_op="Loop Values" from_port="output 1" to_op="Append" to_port="example set 1"/>
<connect from_op="Append" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
The input data was handcrafted and stored with the name occById in the local repository.
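For comparison, the same cumulative count can be produced outside RapidMiner in a few lines of pandas (a sketch assuming the Issue/Exp columns parse as month/day/year dates; column names taken from the question):

import pandas as pd

df = pd.DataFrame({
    "ID":    [100, 100, 101, 102],
    "Issue": ["9/8/2020", "8/5/2019", "6/3/2020", "8/15/2020"],
    "Exp":   ["11/8/2020", "9/5/2019", "10/1/2020", "12/12/2020"],
})
df["Issue"] = pd.to_datetime(df["Issue"], format="%m/%d/%Y")
df["Exp"] = pd.to_datetime(df["Exp"], format="%m/%d/%Y")

# sort by ID and earliest Issue date, then number the rows within each ID
df = df.sort_values(["ID", "Issue"])
df["Count"] = df.groupby("ID").cumcount() + 1
print(df)  # ID 100 gets Count 1 for the 2019 row and 2 for the 2020 row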

Cross Distances in RapidMiner gives question marks

I am using RapidMiner to compare the similarity between two sheets in the same Excel file using Cross Distances: I want to compare one request with all references and return a similarity value via cosine similarity. The problem is that the distance comes back as a question mark (?) and I don't know why.
Process XML:
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.001"> <context> <input/> <output/> <macros/> </context> <operator activated="true" class="process" compatibility="8.2.001" expanded="true" name="Process"> <parameter key="logverbosity" value="init"/> <parameter key="random_seed" value="2001"/> <parameter key="send_mail" value="never"/> <parameter key="notification_email" value=""/> <parameter key="process_duration_for_mail" value="30"/> <parameter key="encoding" value="SYSTEM"/> <process expanded="true"> <operator activated="true" class="read_excel" compatibility="8.2.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="391"> <parameter key="excel_file" value="/Users/macbook/Desktop/ULS/Change Management in ULS/WASP_Requirements.xlsx"/> <parameter key="sheet_selection" value="sheet number"/> <parameter key="sheet_number" value="1"/> <parameter key="imported_cell_range" value="A1:B72"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="first_row_as_names" value="false"/> <list key="annotations"/> <parameter key="date_format" value=""/> <parameter key="time_zone" value="SYSTEM"/> <parameter key="locale" value="English (United States)"/> <parameter key="read_all_values_as_polynominal" value="false"/> <list key="data_set_meta_data_information"> <parameter key="0" value="A.true.polynominal.id"/> <parameter key="1" value="B.true.polynominal.attribute"/> </list> <parameter key="read_not_matching_values_as_missings" value="true"/> <parameter key="datamanagement" value="double_array"/> <parameter key="data_management" value="auto"/> </operator> <operator activated="true" class="remove_duplicates" compatibility="8.2.001" expanded="true" height="103" name="Remove Duplicates" width="90" x="45" y="493"> <parameter key="attribute_filter_type" value="all"/> <parameter key="attribute" value=""/> <parameter key="attributes" value=""/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> <parameter key="treat_missing_values_as_duplicates" value="false"/> </operator> <operator activated="true" class="set_role" compatibility="8.2.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="391"> <parameter key="attribute_name" value="A"/> <parameter key="target_role" value="id"/> <list key="set_additional_roles"> <parameter key="B" value="regular"/> <parameter key="A" value="id"/> </list> </operator> <operator activated="true" class="nominal_to_text" compatibility="8.2.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="493"> <parameter key="attribute_filter_type" value="all"/> <parameter key="attribute" value=""/> <parameter key="attributes" value=""/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="nominal"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="file_path"/> <parameter key="block_type" value="single_value"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="single_value"/> <parameter key="invert_selection" value="false"/> <parameter 
key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="391"> <parameter key="create_word_vector" value="true"/> <parameter key="vector_creation" value="TF-IDF"/> <parameter key="add_meta_information" value="true"/> <parameter key="keep_text" value="true"/> <parameter key="prune_method" value="absolute"/> <parameter key="prune_below_percent" value="3.0"/> <parameter key="prune_above_percent" value="30.0"/> <parameter key="prune_below_absolute" value="2"/> <parameter key="prune_above_absolute" value="9999"/> <parameter key="prune_below_rank" value="0.05"/> <parameter key="prune_above_rank" value="0.95"/> <parameter key="datamanagement" value="double_sparse_array"/> <parameter key="data_management" value="auto"/> <parameter key="select_attributes_and_weights" value="false"/> <list key="specify_weights"> <parameter key="B" value="1.0"/> </list> <process expanded="true"> <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"> <parameter key="mode" value="linguistic tokens"/> <parameter key="characters" value=".:"/> <parameter key="language" value="English"/> <parameter key="max_token_length" value="3"/> </operator> <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="112" y="136"/> <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="112" y="238"> <parameter key="transform_to" value="lower case"/> </operator> <operator activated="true" class="wordnet:open_wordnet_dictionary" compatibility="5.3.000" expanded="true" height="68" name="Open WordNet Dictionary" width="90" x="313" y="391"> <parameter key="resource_type" value="directory"/> <parameter key="directory" value="/Users/macbook/Downloads/WordNet-3.0/dict"/> </operator> <operator activated="true" class="wordnet:stem_wordnet" compatibility="5.3.000" expanded="true" height="82" name="Stem (WordNet)" width="90" x="313" y="238"> <parameter key="allow_ambiguity" value="true"/> <parameter key="keep_unmatched_stems" value="true"/> <parameter key="keep_unmatched_tokens" value="true"/> <parameter key="work_on_type_noun" value="true"/> <parameter key="work_on_type_verb" value="true"/> <parameter key="work_on_type_adjective" value="true"/> <parameter key="work_on_type_adverb" value="true"/> </operator> <operator activated="true" class="wordnet:find_synonym_wordnet" compatibility="5.3.000" expanded="true" height="82" name="Find Synonyms (WordNet)" width="90" x="447" y="238"> <parameter key="use_prefix" value="false"/> <parameter key="synset_word_prefix" value="syn:"/> <parameter key="maximum_recursion_depth" value="1"/> <parameter key="multiple_meanings_per_word_policy" value="Take only first meaning"/> <parameter key="multiple_synsets_policy" value="Take only first synset per meaning"/> <parameter key="multiple_synset_words_policy" value="Take only first synset word"/> <parameter key="concatenation" value="Concatenate result per synset"/> <parameter key="keep_original_tokens" value="true"/> <parameter key="keep_unmatched_tokens" value="true"/> <parameter key="take_ID_instead_of_words" value="false"/> <parameter key="work_on_type_noun" value="true"/> <parameter key="work_on_type_verb" 
value="true"/> <parameter key="work_on_type_adjective" value="true"/> <parameter key="work_on_type_adverb" value="true"/> </operator> <connect from_port="document" to_op="Tokenize" to_port="document"/> <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/> <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Transform Cases" to_port="document"/> <connect from_op="Transform Cases" from_port="document" to_op="Stem (WordNet)" to_port="document"/> <connect from_op="Open WordNet Dictionary" from_port="dictionary" to_op="Stem (WordNet)" to_port="dictionary"/> <connect from_op="Stem (WordNet)" from_port="document" to_op="Find Synonyms (WordNet)" to_port="document"/> <connect from_op="Stem (WordNet)" from_port="dictionary" to_op="Find Synonyms (WordNet)" to_port="dictionary"/> <connect from_op="Find Synonyms (WordNet)" from_port="document" to_port="document 1"/> <portSpacing port="source_document" spacing="0"/> <portSpacing port="sink_document 1" spacing="0"/> <portSpacing port="sink_document 2" spacing="0"/> </process> </operator> <operator activated="false" class="concurrency:k_means" compatibility="8.2.001" expanded="true" height="82" name="Clustering" width="90" x="782" y="34"> <parameter key="add_cluster_attribute" value="true"/> <parameter key="add_as_label" value="false"/> <parameter key="remove_unlabeled" value="false"/> <parameter key="k" value="40"/> <parameter key="max_runs" value="10"/> <parameter key="determine_good_start_values" value="true"/> <parameter key="measure_types" value="BregmanDivergences"/> <parameter key="mixed_measure" value="MixedEuclideanDistance"/> <parameter key="nominal_measure" value="NominalDistance"/> <parameter key="numerical_measure" value="EuclideanDistance"/> <parameter key="divergence" value="SquaredEuclideanDistance"/> <parameter key="kernel_type" value="radial"/> <parameter key="kernel_gamma" value="1.0"/> <parameter key="kernel_sigma1" value="1.0"/> <parameter key="kernel_sigma2" value="0.0"/> <parameter key="kernel_sigma3" value="2.0"/> <parameter key="kernel_degree" value="3.0"/> <parameter key="kernel_shift" value="1.0"/> <parameter key="kernel_a" value="1.0"/> <parameter key="kernel_b" value="0.0"/> <parameter key="max_optimization_steps" value="100"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> </operator> <operator activated="true" class="read_excel" compatibility="8.2.001" expanded="true" height="68" name="Read Excel (2)" width="90" x="45" y="85"> <parameter key="excel_file" value="/Users/macbook/Desktop/ULS/Change Management in ULS/WASP_Requirements.xlsx"/> <parameter key="sheet_selection" value="sheet number"/> <parameter key="sheet_number" value="2"/> <parameter key="imported_cell_range" value="A1:B1"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="first_row_as_names" value="false"/> <list key="annotations"/> <parameter key="date_format" value=""/> <parameter key="time_zone" value="SYSTEM"/> <parameter key="locale" value="English (United States)"/> <parameter key="read_all_values_as_polynominal" value="false"/> <list key="data_set_meta_data_information"> <parameter key="0" value="A.true.polynominal.id"/> <parameter key="1" value="B.true.polynominal.attribute"/> </list> <parameter key="read_not_matching_values_as_missings" value="true"/> <parameter key="datamanagement" value="double_array"/> <parameter key="data_management" value="auto"/> </operator> <operator activated="true" class="remove_duplicates" 
compatibility="8.2.001" expanded="true" height="103" name="Remove Duplicates (2)" width="90" x="45" y="187"> <parameter key="attribute_filter_type" value="all"/> <parameter key="attribute" value=""/> <parameter key="attributes" value=""/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> <parameter key="treat_missing_values_as_duplicates" value="false"/> </operator> <operator activated="true" class="set_role" compatibility="8.2.001" expanded="true" height="82" name="Set Role (2)" width="90" x="179" y="85"> <parameter key="attribute_name" value="A"/> <parameter key="target_role" value="id"/> <list key="set_additional_roles"> <parameter key="B" value="regular"/> <parameter key="A" value="id"/> </list> </operator> <operator activated="true" class="nominal_to_text" compatibility="8.2.001" expanded="true" height="82" name="Nominal to Text (2)" width="90" x="179" y="187"> <parameter key="attribute_filter_type" value="all"/> <parameter key="attribute" value=""/> <parameter key="attributes" value=""/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="nominal"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="file_path"/> <parameter key="block_type" value="single_value"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="single_value"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="313" y="85"> <parameter key="create_word_vector" value="true"/> <parameter key="vector_creation" value="TF-IDF"/> <parameter key="add_meta_information" value="true"/> <parameter key="keep_text" value="true"/> <parameter key="prune_method" value="absolute"/> <parameter key="prune_below_percent" value="3.0"/> <parameter key="prune_above_percent" value="30.0"/> <parameter key="prune_below_absolute" value="2"/> <parameter key="prune_above_absolute" value="9999"/> <parameter key="prune_below_rank" value="0.05"/> <parameter key="prune_above_rank" value="0.95"/> <parameter key="datamanagement" value="double_sparse_array"/> <parameter key="data_management" value="auto"/> <parameter key="select_attributes_and_weights" value="false"/> <list key="specify_weights"> <parameter key="B" value="1.0"/> </list> <process expanded="true"> <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="112" y="34"> <parameter key="mode" value="linguistic tokens"/> <parameter key="characters" value=".:"/> <parameter key="language" value="English"/> <parameter key="max_token_length" value="3"/> </operator> <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="112" y="136"/> <operator activated="true" class="text:transform_cases" 
compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="112" y="238"> <parameter key="transform_to" value="lower case"/> </operator> <operator activated="true" class="wordnet:open_wordnet_dictionary" compatibility="5.3.000" expanded="true" height="68" name="Open WordNet Dictionary (2)" width="90" x="313" y="391"> <parameter key="resource_type" value="directory"/> <parameter key="directory" value="/Users/macbook/Downloads/WordNet-3.0/dict"/> </operator> <operator activated="true" class="wordnet:stem_wordnet" compatibility="5.3.000" expanded="true" height="82" name="Stem (2)" width="90" x="313" y="238"> <parameter key="allow_ambiguity" value="true"/> <parameter key="keep_unmatched_stems" value="true"/> <parameter key="keep_unmatched_tokens" value="true"/> <parameter key="work_on_type_noun" value="true"/> <parameter key="work_on_type_verb" value="true"/> <parameter key="work_on_type_adjective" value="true"/> <parameter key="work_on_type_adverb" value="true"/> </operator> <operator activated="true" class="wordnet:find_synonym_wordnet" compatibility="5.3.000" expanded="true" height="82" name="Find Synonyms (2)" width="90" x="447" y="238"> <parameter key="use_prefix" value="false"/> <parameter key="synset_word_prefix" value="syn:"/> <parameter key="maximum_recursion_depth" value="1"/> <parameter key="multiple_meanings_per_word_policy" value="Take only first meaning"/> <parameter key="multiple_synsets_policy" value="Take only first synset per meaning"/> <parameter key="multiple_synset_words_policy" value="Take only first synset word"/> <parameter key="concatenation" value="Concatenate result per synset"/> <parameter key="keep_original_tokens" value="true"/> <parameter key="keep_unmatched_tokens" value="true"/> <parameter key="take_ID_instead_of_words" value="false"/> <parameter key="work_on_type_noun" value="true"/> <parameter key="work_on_type_verb" value="true"/> <parameter key="work_on_type_adjective" value="true"/> <parameter key="work_on_type_adverb" value="true"/> </operator> <connect from_port="document" to_op="Tokenize (2)" to_port="document"/> <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/> <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/> <connect from_op="Transform Cases (2)" from_port="document" to_op="Stem (2)" to_port="document"/> <connect from_op="Open WordNet Dictionary (2)" from_port="dictionary" to_op="Stem (2)" to_port="dictionary"/> <connect from_op="Stem (2)" from_port="document" to_op="Find Synonyms (2)" to_port="document"/> <connect from_op="Stem (2)" from_port="dictionary" to_op="Find Synonyms (2)" to_port="dictionary"/> <connect from_op="Find Synonyms (2)" from_port="document" to_port="document 1"/> <portSpacing port="source_document" spacing="0"/> <portSpacing port="sink_document 1" spacing="0"/> <portSpacing port="sink_document 2" spacing="0"/> </process> </operator> <operator activated="true" class="multiply" compatibility="8.2.001" expanded="true" height="103" name="Multiply" width="90" x="447" y="85"/> <operator activated="true" class="order_attributes" compatibility="8.2.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="447" y="238"> <parameter key="sort_mode" value="reference data"/> <parameter key="attribute_ordering" value=""/> <parameter key="use_regular_expressions" value="false"/> <parameter key="handle_unmatched" value="append"/> <parameter key="sort_direction" value="ascending"/> 
</operator> <operator activated="true" class="multiply" compatibility="8.2.001" expanded="true" height="103" name="Multiply (2)" width="90" x="581" y="340"/> <operator activated="true" class="order_attributes" compatibility="8.2.001" expanded="true" height="82" name="Reorder Attributes (2)" width="90" x="581" y="238"> <parameter key="sort_mode" value="reference data"/> <parameter key="attribute_ordering" value=""/> <parameter key="use_regular_expressions" value="false"/> <parameter key="handle_unmatched" value="append"/> <parameter key="sort_direction" value="ascending"/> </operator> <operator activated="true" class="cross_distances" compatibility="8.2.001" expanded="true" height="103" name="Cross Distances" width="90" x="715" y="238"> <parameter key="measure_types" value="NumericalMeasures"/> <parameter key="mixed_measure" value="MixedEuclideanDistance"/> <parameter key="nominal_measure" value="SimpleMatchingSimilarity"/> <parameter key="numerical_measure" value="CosineSimilarity"/> <parameter key="divergence" value="GeneralizedIDivergence"/> <parameter key="kernel_type" value="radial"/> <parameter key="kernel_gamma" value="1.0"/> <parameter key="kernel_sigma1" value="1.0"/> <parameter key="kernel_sigma2" value="0.0"/> <parameter key="kernel_sigma3" value="2.0"/> <parameter key="kernel_degree" value="3.0"/> <parameter key="kernel_shift" value="1.0"/> <parameter key="kernel_a" value="1.0"/> <parameter key="kernel_b" value="0.0"/> <parameter key="only_top_k" value="false"/> <parameter key="k" value="10"/> <parameter key="search_for" value="nearest"/> <parameter key="compute_similarities" value="true"/> </operator> <operator activated="false" class="data_to_similarity" compatibility="8.2.001" expanded="true" height="82" name="Data to Similarity" width="90" x="648" y="34"> <parameter key="measure_types" value="NumericalMeasures"/> <parameter key="mixed_measure" value="MixedEuclideanDistance"/> <parameter key="nominal_measure" value="NominalDistance"/> <parameter key="numerical_measure" value="CosineSimilarity"/> <parameter key="divergence" value="GeneralizedIDivergence"/> <parameter key="kernel_type" value="radial"/> <parameter key="kernel_gamma" value="1.0"/> <parameter key="kernel_sigma1" value="1.0"/> <parameter key="kernel_sigma2" value="0.0"/> <parameter key="kernel_sigma3" value="2.0"/> <parameter key="kernel_degree" value="3.0"/> <parameter key="kernel_shift" value="1.0"/> <parameter key="kernel_a" value="1.0"/> <parameter key="kernel_b" value="0.0"/> </operator> <connect from_op="Read Excel" from_port="output" to_op="Remove Duplicates" to_port="example set input"/> <connect from_op="Remove Duplicates" from_port="example set output" to_op="Set Role" to_port="example set input"/> <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/> <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/> <connect from_op="Process Documents from Data" from_port="example set" to_op="Reorder Attributes" to_port="example set input"/> <connect from_op="Read Excel (2)" from_port="output" to_op="Remove Duplicates (2)" to_port="example set input"/> <connect from_op="Remove Duplicates (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/> <connect from_op="Set Role (2)" from_port="example set output" to_op="Nominal to Text (2)" to_port="example set input"/> <connect from_op="Nominal to Text (2)" from_port="example set output" to_op="Process Documents 
from Data (2)" to_port="example set"/> <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Multiply" to_port="input"/> <connect from_op="Multiply" from_port="output 1" to_op="Reorder Attributes (2)" to_port="example set input"/> <connect from_op="Multiply" from_port="output 2" to_op="Reorder Attributes" to_port="reference_data"/> <connect from_op="Reorder Attributes" from_port="example set output" to_op="Multiply (2)" to_port="input"/> <connect from_op="Multiply (2)" from_port="output 1" to_op="Reorder Attributes (2)" to_port="reference_data"/> <connect from_op="Multiply (2)" from_port="output 2" to_op="Cross Distances" to_port="reference set"/> <connect from_op="Reorder Attributes (2)" from_port="example set output" to_op="Cross Distances" to_port="request set"/> <connect from_op="Cross Distances" from_port="result set" to_port="result 1"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_result 1" spacing="0"/> <portSpacing port="sink_result 2" spacing="0"/> <description align="center" color="gray" colored="true" height="163" resized="true" width="142" x="28" y="320">Read Requirements Document</description> <description align="center" color="gray" colored="true" height="147" resized="true" width="126" x="30" y="14">Read Requirements Change Requests</description> </process> </operator> </process>
The most likely reason would be that you have infinite values somewhere in your data. For these the cosine distance is not defined, so you get a "?" (i.e. a missing value) in your result.
If this is the case, you can use the Replace Infinite Values operator before the calculation.
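To illustrate why infinities turn into missing values (plain NumPy, outside RapidMiner): the cosine formula divides a dot product by the two vector norms, and as soon as one component is infinite the result is inf/inf, i.e. NaN, which the Cross Distances output shows as "?". Replacing the infinite value first, as Replace Infinite Values would, makes the similarity defined again.

import numpy as np

def cosine_similarity(a, b):
    # cos(a, b) = a.b / (||a|| * ||b||)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

request = np.array([1.0, 2.0, np.inf])   # one infinite TF-IDF weight somewhere in the data
reference = np.array([1.0, 0.0, 3.0])

print(cosine_similarity(request, reference))  # nan -> displayed as '?' in RapidMiner
fixed = np.nan_to_num(request, posinf=0.0)    # crude stand-in for Replace Infinite Values
print(cosine_similarity(fixed, reference))    # a regular number again (about 0.14)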

Creating a Pareto chart in RapidMiner

I am not able to plot a simple Pareto chart.
My data looks like the example set generated by the XML below.
When I try to create a Pareto chart I get a blank space, and I also cannot select a value for "Count Value".
What am I missing here?
My sample data is generated by this XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="447" y="75">
<list key="attribute_values">
<parameter key="category" value=""black""/>
<parameter key="Incidents" value="10"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="447" y="390">
<list key="attribute_values">
<parameter key="category" value=""blue""/>
<parameter key="Incidents" value="2"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="447" y="210">
<list key="attribute_values">
<parameter key="category" value=""green""/>
<parameter key="Incidents" value="7"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (4)" width="90" x="447" y="165">
<list key="attribute_values">
<parameter key="category" value=""white""/>
<parameter key="Incidents" value="8"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (5)" width="90" x="447" y="300">
<list key="attribute_values">
<parameter key="category" value=""red""/>
<parameter key="Incidents" value="2"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (6)" width="90" x="447" y="480">
<list key="attribute_values">
<parameter key="category" value=""Yellow""/>
<parameter key="Incidents" value="1"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (7)" width="90" x="447" y="705">
<list key="attribute_values">
<parameter key="category" value=""Gray""/>
<parameter key="Incidents" value="1"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (8)" width="90" x="447" y="840">
<list key="attribute_values">
<parameter key="category" value=""Navy""/>
<parameter key="Incidents" value="1"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="5.3.015" expanded="true" height="60" name="Generate Data by User Specification (9)" width="90" x="447" y="570">
<list key="attribute_values">
<parameter key="category" value=""Purple""/>
<parameter key="Incidents" value="1"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="append" compatibility="5.3.015" expanded="true" height="220" name="Append" width="90" x="715" y="120"/>
<connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
<connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 5"/>
<connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 4"/>
<connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 2"/>
<connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 3"/>
<connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
<connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 9"/>
<connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 7"/>
<connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 8"/>
<connect from_op="Append" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
So I found a workaround (thanks to Andrew), which only works for this example set.
I had to "de-aggregate" it and add a new polynominal attribute with the same value for every example.
Then I could create a Pareto chart, group by 'category', and set the count column to the new attribute.
That produced the expected chart for the sample data, but when I do the same with my real dataset the chart becomes unreadable.
I guess that without being able to configure the Pareto chart further, it handles a large number of different values in the group-by category really badly.
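For what it's worth, the numbers behind a Pareto chart are simple to compute yourself if the built-in chart won't cooperate: sort the categories by count descending and add a cumulative-percentage column. A pandas sketch using the incident counts from the XML above:

import pandas as pd

counts = pd.DataFrame({
    "category":  ["black", "white", "green", "red", "blue", "Yellow", "Gray", "Navy", "Purple"],
    "Incidents": [10, 8, 7, 2, 2, 1, 1, 1, 1],
})

pareto = counts.sort_values("Incidents", ascending=False).reset_index(drop=True)
pareto["cum_pct"] = pareto["Incidents"].cumsum() / pareto["Incidents"].sum() * 100
print(pareto)  # bar heights = Incidents, Pareto line = cum_pct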

Export Distribution Model in RapidMiner

I have an example set in RapidMiner. It has 2 columns.
For example:
colA colB
a 1
a 2
b 3
b 2
=====
I have used Naive Bayes. It gives the probability of each value of colB for colA in the distribution table.
For example, P(2) = 0.5.
I need that distribution table as output.
Write Model, Write Excel/CSV, and Write do not help.
What should I do?
Thanks in advance.
The simplest solution would be to just mark the table with your mouse (Ctrl+A works as well) and use copy and paste.
Unfortunately this only works manually; if you have to export the data very often, the next best step would be to write your own operator for it (which is actually quite simple and requires only basic Java skills):
http://docs.rapidminer.com/developers/
Yes, you can. If you install the Reporting extension from the Marketplace (it's free), then you can export the distribution table, plot view, or text view.
Here's a sample process.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.0.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="reporting:generate_report" compatibility="5.3.000" expanded="true" height="68" name="Generate Report" width="90" x="45" y="34">
<parameter key="report_name" value="myReport"/>
</operator>
<operator activated="true" class="retrieve" compatibility="7.0.000" expanded="true" height="68" name="Golf" width="90" x="112" y="85">
<parameter key="repository_entry" value="//Samples/data/Golf"/>
</operator>
<operator activated="true" class="retrieve" compatibility="7.0.000" expanded="true" height="68" name="Golf-Testset" width="90" x="179" y="210">
<parameter key="repository_entry" value="//Samples/data/Golf-Testset"/>
</operator>
<operator activated="true" class="naive_bayes" compatibility="7.0.000" expanded="true" height="82" name="Naive Bayes" width="90" x="246" y="34"/>
<operator activated="true" class="reporting:report" compatibility="5.3.000" expanded="true" height="68" name="Report" width="90" x="380" y="34">
<parameter key="report_name" value="myReport"/>
<parameter key="report_item_header" value="Distribution Table"/>
<parameter key="specified" value="true"/>
<parameter key="reportable_type" value="Distribution Model"/>
<parameter key="renderer_name" value="Distribution Table"/>
<list key="parameters">
<parameter key="min_row" value="1"/>
<parameter key="max_row" value="2147483647"/>
<parameter key="min_column" value="1"/>
<parameter key="max_column" value="2147483647"/>
<parameter key="sort_column" value="2147483647"/>
<parameter key="sort_decreasing" value="false"/>
</list>
</operator>
<operator activated="true" class="apply_model" compatibility="7.0.000" expanded="true" height="82" name="Apply Model" width="90" x="514" y="120">
<list key="application_parameters"/>
</operator>
<connect from_op="Golf" from_port="output" to_op="Naive Bayes" to_port="training set"/>
<connect from_op="Golf-Testset" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Naive Bayes" from_port="model" to_op="Report" to_port="reportable in"/>
<connect from_op="Report" from_port="reportable out" to_op="Apply Model" to_port="model"/>
<connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="90"/>
<portSpacing port="sink_result 2" spacing="18"/>
</process>
</operator>
</process>
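If exporting from RapidMiner stays awkward, note that the distribution table for nominal attributes is essentially the conditional probability of each value given the class, which you can also compute and save directly from the raw data. A quick pandas sketch using the colA/colB example from the question (RapidMiner's Naive Bayes may apply a Laplace correction, so its numbers can differ slightly from plain counts):

import pandas as pd

df = pd.DataFrame({"colA": ["a", "a", "b", "b"], "colB": [1, 2, 3, 2]})

# P(colB = value | colA = class), i.e. row-normalised counts
dist = pd.crosstab(df["colA"], df["colB"], normalize="index")
print(dist)                              # e.g. P(colB=2 | colA=a) = 0.5
dist.to_csv("distribution_table.csv")    # written to disk, unlike the model's internal table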

Data written 2 times by the operator "Process Documents from Data" in RapidMiner

I'm new to RapidMiner and tried to read RSS feeds and write them to files, following this tutorial on Vimeo (https://vimeo.com/62963128). Almost everything works, but I have one problem: the process generates every file twice. File 1 and file 21 are the same, file 2 and file 22 are the same, and so on. Does anybody know what I'm doing wrong?
This is the process XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="6.5.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.5.002" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="web:read_rss" compatibility="6.5.000" expanded="true" height="60" name="Read RSS Feed" width="90" x="112" y="255">
<parameter key="url" value="http://www.autoblog.com/category/recap/rss.xml"/>
<parameter key="random_user_agent" value="true"/>
<parameter key="connection_timeout" value="10000"/>
<parameter key="read_timeout" value="10000"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="6.5.000" expanded="true" height="60" name="Get Pages" width="90" x="514" y="435">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="myhtml"/>
<parameter key="random_user_agent" value="false"/>
<parameter key="connection_timeout" value="10000"/>
<parameter key="read_timeout" value="10000"/>
<parameter key="follow_redirects" value="true"/>
<parameter key="accept_cookies" value="none"/>
<parameter key="cookie_scope" value="global"/>
<parameter key="request_method" value="GET"/>
<parameter key="delay" value="none"/>
<parameter key="delay_amount" value="1000"/>
<parameter key="min_delay_amount" value="0"/>
<parameter key="max_delay_amount" value="1000"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="6.5.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="715" y="345">
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="TF-IDF"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="false"/>
<parameter key="prune_method" value="none"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="select_attributes_and_weights" value="false"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="6.5.000" expanded="true" height="60" name="Extract Content" width="90" x="112" y="30">
<parameter key="extract_content" value="true"/>
<parameter key="minimum_text_block_length" value="5"/>
<parameter key="override_content_type_information" value="true"/>
<parameter key="neglegt_span_tags" value="true"/>
<parameter key="neglect_p_tags" value="true"/>
<parameter key="neglect_b_tags" value="true"/>
<parameter key="neglect_i_tags" value="true"/>
<parameter key="neglect_br_tags" value="true"/>
<parameter key="ignore_non_html_tags" value="true"/>
</operator>
<operator activated="true" class="web:unescape_html" compatibility="6.5.000" expanded="true" height="60" name="Unescape HTML Document" width="90" x="313" y="30"/>
<operator activated="true" class="text:write_document" compatibility="6.5.000" expanded="true" height="76" name="Write Document" width="90" x="514" y="30">
<parameter key="overwrite" value="true"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
<operator activated="true" class="write_file" compatibility="6.5.002" expanded="true" height="60" name="Write File" width="90" x="715" y="165">
<parameter key="resource_type" value="file"/>
<parameter key="filename" value="d:\test\%{t}-%{a}.txt"/>
<parameter key="mime_type" value="application/octet-stream"/>
</operator>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Unescape HTML Document" to_port="document"/>
<connect from_op="Unescape HTML Document" from_port="document" to_op="Write Document" to_port="document"/>
<connect from_op="Write Document" from_port="document" to_port="document 1"/>
<connect from_op="Write Document" from_port="file" to_op="Write File" to_port="file"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read RSS Feed" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Thanks in advance.
Regards,
Dave
If you turn off create word vector for the Process Documents operator, you should find that you get 20 files. I can't explain it.