Generate ExampleSet with zeros in RapidMiner - rapidminer

What is the easiest/correct way to generate an ExampleSet in RapidMiner that looks like this:
The way I am using now:
Select Attributes was necessary because ‘Generate Data’ gave me a ‘label’ attribute which I don’t want

Three operators seems to be the minimum. You could use Generate Data by User Specification combined with Loop and Append. Here's an example...
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="concurrency:loop" compatibility="7.5.000" expanded="true" height="82" name="Loop" width="90" x="246" y="136">
<process expanded="true">
<operator activated="true" class="generate_data_user_specification" compatibility="7.5.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="179" y="238">
<list key="attribute_values">
<parameter key="attribute1" value="0"/>
<parameter key="anotherattribute" value="0"/>
</list>
<list key="set_additional_roles"/>
</operator>
<connect from_op="Generate Data by User Specification" from_port="output" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="82" name="Append" width="90" x="447" y="136"/>
<connect from_op="Loop" from_port="output 1" to_op="Append" to_port="example set 1"/>
<connect from_op="Append" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Andrew

Related

Rapidminer, sort and filter attributes

I am using RapidMiner, and I would like to order my attributes per example, and then filter the 15 highest values. Can someone give me a clue on that?
Thanks.
So if I understand you correctly, you have an example set and for each row you want to sort the attributes and keep only the 15 biggest?
So imagine you have this random data set:
You try to get this result for your first example (limited to 5 biggest attributes for simplicity):
This can be done by looping over your example set with Loop Examples, filter each row with Filter Example Range, then Transpose the rows to columns, which then again can be sorted with Sort and limited to the max 15 values using Filter Example Range again.
Please notice that these new examples (rows) cannot be easily represented in a single example set, since they presumably all have different attributes in a different order, thus you will get a collection of separate, sorted example sets.
Here is an example process:
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="generate_data" compatibility="7.5.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="34">
<parameter key="number_examples" value="10"/>
<parameter key="number_of_attributes" value="10"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<description align="center" color="transparent" colored="false" width="126">remove label for clarity</description>
</operator>
<operator activated="true" class="loop_examples" compatibility="7.5.000" expanded="true" height="103" name="Loop Examples" width="90" x="313" y="34">
<process expanded="true">
<operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range" width="90" x="45" y="34">
<parameter key="first_example" value="%{example}"/>
<parameter key="last_example" value="%{example}"/>
<description align="center" color="transparent" colored="false" width="126">only work on current example</description>
</operator>
<operator activated="true" class="transpose" compatibility="7.5.000" expanded="true" height="82" name="Transpose" width="90" x="179" y="34">
<description align="center" color="transparent" colored="false" width="126">transpose the row to columns with attribute name and attribute value</description>
</operator>
<operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
<parameter key="old_name" value="id"/>
<parameter key="new_name" value="Example%{example}AttributeName"/>
<list key="rename_additional_attributes">
<parameter key="att_1" value="Value"/>
</list>
<description align="center" color="transparent" colored="false" width="126">rename for clarity, include original &quot;row&quot; number in attribute name</description>
</operator>
<operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="447" y="34">
<parameter key="attribute_name" value="Value"/>
<parameter key="sorting_direction" value="decreasing"/>
<description align="center" color="transparent" colored="false" width="126">sort decreasing by attribute value</description>
</operator>
<operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range (2)" width="90" x="581" y="34">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="5"/>
<description align="center" color="transparent" colored="false" width="126">keep 5 attributes with biggest values</description>
</operator>
<connect from_port="example set" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<connect from_op="Transpose" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Filter Example Range (2)" to_port="example set input"/>
<connect from_op="Filter Example Range (2)" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
<description align="center" color="transparent" colored="false" width="126">Loop over all examples, current index is stored in the macro &quot;example&quot;</description>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
<connect from_op="Loop Examples" from_port="output 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>

How to load transaction (basket) data in RapidMiner for association rule?

I have comma separated transaction (basket) data in itemsets format
citrus fruit,semi-finished,bread,margarine
tropical fruit,yogurt,coffee,milk
yogurt,cream,cheese,meat spreads
etc
where each row indicates the items purchased in a single transaction.
By using the Read CSV operator I loaded this file into RapidMiner. I could not find any operator to transform this data for FP-Growth and association rule mining.
Is there any way to read such type of file in RapidMiner for association rule mining?
I finally understood what you meant - sorry I was being slow. This can be done using operators from the Text Processing Extension. You have to install this from the RapidMiner repository. Once you have you can try this process.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.0.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.0.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="7.0.000" expanded="true" height="68" name="Read CSV" width="90" x="246" y="85">
<parameter key="csv_file" value="C:\Temp\is.txt"/>
<parameter key="column_separators" value="\r\n"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations"/>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="att1.true.polynominal.attribute"/>
</list>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="7.0.000" expanded="true" height="82" name="Nominal to Text" width="90" x="380" y="85"/>
<operator activated="true" class="text:process_document_from_data" compatibility="7.0.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="514" y="85">
<parameter key="vector_creation" value="Term Occurrences"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.0.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
<parameter key="mode" value="specify characters"/>
<parameter key="characters" value=","/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
The trick is to use Read CSV to read the original file in but use end of line as the delimiter. This reads the entire line in as a polynominal attribute. From there, you have to convert this to text so that the text processing operators can do their work. The Process Documents from Data operator is then used to make the final example set. The important point is to use the Tokenize operator to split the lines into words separated by commas.

Rapidminer - k-means query

Sorry if this is a very novice question to ask, but I have recently started exploring RapidMiner. I have used it to cluster my sample data [using k-means clustering]. My query is: if I use an Excel raw data file to cluster, how will I get my Excel data back [output data] split into k clusters in an Excel file? I know how to create clusters and switch between the Design and Results screens.
Thanks in advance.
Hi and welcome to stackoverflow and RapidMiner.
If I understand your question correctly, you read your data from excel, make a clustering and then want to write the single clusters back to excel.
If you want to do it manually you can use the "Filter Examples" Operator and filter for the specific cluster.
You can also do it automatically with the "Loop Values" Operator, where you set the loop attribute to cluster and use the iteration macro inside the loop to filter your data. Then you could store your data and use the iteration macro also for the file name.
See the sample process below (you can copy it and paste it in the XML panel directly in RapidMiner):
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.0.0">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.1.000-SNAPSHOT" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="generate_data" compatibility="7.1.000-SNAPSHOT" expanded="true" height="68" name="Generate Data" width="90" x="112" y="34"/>
<operator activated="true" class="generate_id" compatibility="7.1.000-SNAPSHOT" expanded="true" height="82" name="Generate ID" width="90" x="246" y="34"/>
<operator activated="true" class="k_means" compatibility="7.1.000-SNAPSHOT" expanded="true" height="82" name="Clustering" width="90" x="447" y="34">
<parameter key="k" value="5"/>
</operator>
<operator activated="true" class="loop_values" compatibility="7.1.000-SNAPSHOT" expanded="true" height="82" name="Loop Values" width="90" x="715" y="34">
<parameter key="attribute" value="cluster"/>
<process expanded="true">
<operator activated="true" breakpoints="after" class="filter_examples" compatibility="7.1.000-SNAPSHOT" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
<list key="filters_list">
<parameter key="filters_entry_key" value="cluster.equals.%{loop_value}"/>
</list>
</operator>
<connect from_port="example set" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_port="out 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="clustered set" to_op="Loop Values" to_port="example set"/>
<connect from_op="Loop Values" from_port="out 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>

Build Correlation Matrix to use two different cells of two Csv files in Rapid miner

I want to build a correlation matrix with two different columns of two different CSV files. Can anyone tell me how I can specify one column from one file and the same from the other file?
You have to create a new example set by joining the two columns together with the Join operator and then you can calculate the correlation matrix. Make sure when joining that the two example sets have the same ID attribute.
The code block below shows an example process of how to select and join two attributes.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="6.4.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.0.000-SNAPSHOT" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="7.0.000-SNAPSHOT" expanded="true" height="60" name="Retrieve Iris" width="90" x="45" y="75">
<parameter key="repository_entry" value="//Samples/data/Iris"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.0.000-SNAPSHOT" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="75">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="a1"/>
</operator>
<operator activated="true" class="retrieve" compatibility="7.0.000-SNAPSHOT" expanded="true" height="60" name="Retrieve Iris (2)" width="90" x="45" y="255">
<parameter key="repository_entry" value="//Samples/data/Iris"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.0.000-SNAPSHOT" expanded="true" height="76" name="Select Attributes (2)" width="90" x="179" y="255">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="a2"/>
</operator>
<operator activated="true" class="join" compatibility="7.0.000-SNAPSHOT" expanded="true" height="76" name="Join" width="90" x="380" y="165">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="correlation_matrix" compatibility="7.0.000-SNAPSHOT" expanded="true" height="94" name="Correlation Matrix" width="90" x="581" y="165"/>
<connect from_op="Retrieve Iris" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
<connect from_op="Retrieve Iris (2)" from_port="output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Join" to_port="right"/>
<connect from_op="Join" from_port="join" to_op="Correlation Matrix" to_port="example set"/>
<connect from_op="Correlation Matrix" from_port="matrix" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>

RapidMiner - process data table rows as documents

I need help with the following question: I would like to process each row from a data table (example set structure (label[datatype]): mid[int], body[text]), obtained by Read Database, as a document. In this way I can apply some filters to each document (stop-words, filters, and so on). Can anyone help me?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_database" compatibility="5.3.015" expanded="true" height="60" name="Server Connection" width="90" x="112" y="120">
<parameter key="connection" value="Server"/>
<parameter key="query" value="SELECT `mid`, `body` FROM `message` WHERE `mid` &amp;lt; 10 ORDER BY `mid`"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="loop_data_sets" compatibility="5.3.015" expanded="true" height="76" name="Loop Data Sets" width="90" x="111" y="210">
<process expanded="true">
<operator activated="true" class="text:extract_document" compatibility="5.3.002" expanded="true" name="Extract Document (2)">
<parameter key="attribute_name" value="body"/>
<parameter key="example_index" value="1"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" name="Transform Cases (2)"/>
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" name="Tokenize (2)"/>
<operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" name="Filter Stopwords (2)">
<parameter key="file" value="C:\User\stopwords.txt"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" name="Filter Tokens (2)">
<parameter key="min_chars" value="2"/>
</operator>
<connect from_port="example set" to_op="Extract Document (2)" to_port="example set"/>
<connect from_op="Extract Document (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
<connect from_op="Filter Tokens (2)" from_port="document" to_port="output 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<connect from_op="Server Connection" from_port="output" to_op="Loop Data Sets" to_port="example set 1"/>
<connect from_op="Loop Data Sets" from_port="output 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
I've finally found a solution:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_database" compatibility="5.3.015" expanded="true" height="60" name="Server Connection" width="90" x="45" y="75">
<parameter key="connection" value="Server"/>
<parameter key="query" value="SELECT `mid`, `body` FROM `message` WHERE `mid` &amp;lt; 10 ORDER BY `mid`"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="45" y="165">
<parameter key="create_word_vector" value="false"/>
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="5.3.001" expanded="true" height="60" name="Extract Content" width="90" x="45" y="30">
<parameter key="minimum_text_block_length" value="3"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="120"/>
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (3)" width="90" x="45" y="210"/>
<operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stopwords (3)" width="90" x="45" y="300">
<parameter key="file" value="C:\User\stopwords.txt"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (3)" width="90" x="45" y="390">
<parameter key="min_chars" value="3"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Delete links" width="90" x="45" y="480">
<parameter key="mode" value="regular expression"/>
<parameter key="expression" value="(http[s]?://[a-z_\.A-Z0-9\-]*)"/>
</operator>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Tokenize (3)" to_port="document"/>
<connect from_op="Tokenize (3)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
<connect from_op="Filter Stopwords (3)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
<connect from_op="Filter Tokens (3)" from_port="document" to_op="Delete links" to_port="document"/>
<connect from_op="Delete links" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID" width="90" x="179" y="165"/>
<connect from_op="Server Connection" from_port="output" to_op="Process Documents from Data (2)" to_port="example set"/>
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>