RapidMiner - Generate association rules for each date

I'm trying to find association rules in my market basket analysis by applying FP-Growth.
My goal is to find association rules by date, i.e. item associations on a daily basis for up to a year.
I can design a process that finds associations for a couple of days, but it's time-consuming to design it for 365 days. The dataset is shown below.
I have used the market basket analysis template provided in RapidMiner.
How can I achieve this in a couple of steps rather than executing the process separately for each day of the year?
Thanks
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process" origin="GENERATED_TEMPLATE">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.0.002" expanded="true" height="68" name="Retrieve Clustered Data with Items" width="90" x="45" y="187">
<parameter key="repository_entry" value="Clustered Data with Items"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="9.0.002" expanded="true" height="103" name="Filter Examples" width="90" x="313" y="187">
<list key="filters_list">
<parameter key="filters_entry_key" value="ReceiptDate.eq.01/05/2017"/>
</list>
</operator>
<operator activated="true" class="aggregate" compatibility="6.0.006" expanded="true" height="82" name="Aggregate" origin="GENERATED_TEMPLATE" width="90" x="112" y="336">
<list key="aggregation_attributes">
<parameter key="Orders" value="sum"/>
</list>
<parameter key="group_by_attributes" value="Invoice|product 1"/>
</operator>
<operator activated="true" class="pivot" compatibility="9.0.002" expanded="true" height="82" name="Pivot" origin="GENERATED_TEMPLATE" width="90" x="246" y="336">
<parameter key="group_attribute" value="Invoice"/>
<parameter key="index_attribute" value="product 1"/>
</operator>
<operator activated="true" class="rename_by_replacing" compatibility="9.0.002" expanded="true" height="82" name="Rename by Replacing" origin="GENERATED_TEMPLATE" width="90" x="380" y="336">
<parameter key="attribute" value="Invoice"/>
<parameter key="replace_what" value="sum\(Orders\)_"/>
</operator>
<operator activated="true" class="replace_missing_values" compatibility="9.0.002" expanded="true" height="103" name="Replace Missing Values" origin="GENERATED_TEMPLATE" width="90" x="112" y="442">
<parameter key="default" value="zero"/>
<list key="columns"/>
</operator>
<operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" origin="GENERATED_TEMPLATE" width="90" x="246" y="442"/>
<operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" origin="GENERATED_TEMPLATE" width="90" x="380" y="442">
<parameter key="attribute_name" value="Invoice"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="concurrency:fp_growth" compatibility="9.0.002" expanded="true" height="82" name="FP-Growth" origin="GENERATED_TEMPLATE" width="90" x="648" y="289">
<parameter key="positive_value" value="true"/>
<parameter key="min_support" value="0.005"/>
<parameter key="find_min_number_of_itemsets" value="false"/>
<enumeration key="must_contain_list"/>
</operator>
<operator activated="true" class="create_association_rules" compatibility="9.0.002" expanded="true" height="82" name="Create Association Rules" origin="GENERATED_TEMPLATE" width="90" x="648" y="442">
<parameter key="min_confidence" value="0.1"/>
</operator>
<connect from_op="Retrieve Clustered Data with Items" from_port="output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Pivot" to_port="example set input"/>
<connect from_op="Pivot" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
<connect from_op="Rename by Replacing" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
<connect from_op="Replace Missing Values" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="FP-Growth" to_port="example set"/>
<connect from_op="FP-Growth" from_port="frequent sets" to_op="Create Association Rules" to_port="item sets"/>
<connect from_op="Create Association Rules" from_port="rules" to_port="result 1"/>
<connect from_op="Create Association Rules" from_port="item sets" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="147"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="42"/>
<description align="left" color="yellow" colored="false" height="70" resized="false" width="850" x="20" y="25">MARKET BASKET ANALYSIS<br>Model associations between products by determining sets of items frequently purchased together and building association rules to derive recommendations.</description>
<description align="left" color="blue" colored="true" height="185" resized="true" width="550" x="20" y="105">Step 1:<br/>Load transaction data containing a transaction id, a product id and a quantifier. The data denotes how many times a certain product has been purchased as part of a transactions.</description>
<description align="left" color="purple" colored="true" height="341" resized="true" width="549" x="20" y="300"><br> <br> <br> <br> <br> <br> <br> <br> <br> <br> <br> <br> <br> Step 2:<br>Edit, transform &amp; load (ETL) - Aggregate transaction data to account for multiple occurrences of the same product in a transaction. Pivot the data so that each transaction is represented by a row. Transform purchase amounts to binary &quot;product purchased yes/no &quot; indicators.<br></description>
<description align="left" color="green" colored="true" height="310" resized="true" width="290" x="580" y="105">Step 3:<br/>Using FP-Growth, determine frequent item sets. A frequent item sets denotes that the items (products) in the set have been purchased together frequently, i.e. in a certain ratio of transactions. This ratio is given by the support of the item set.</description>
<description align="left" color="green" colored="true" height="215" resized="true" width="286" x="579" y="425"><br> <br> <br> <br> <br> <br> Step 4:<br/>Create association rules which can be used for product recommendations depending on the confidences of the rules.<br></description>
<description align="left" color="yellow" colored="false" height="35" resized="true" width="849" x="20" y="655">Outputs: association rules, frequent item set<br></description>
</process>
</operator>
</process>
Sample Data

After retrieving your data as an example set, you can then use the Loop Values operator to loop over the ReceiptDate attribute. The current value (in your case, the date) is stored in the loop_value macro.
You then put the whole process of building your association rules inside the loop's subprocess and change your Filter Examples operator to the condition class "expression", with ReceiptDate == %{loop_value} as the parameter expression.
This filters your whole dataset so that only the examples of the current date are kept, and the model is then built on that subset. As a result, you get a collection of models, one per date, on the output port of Loop Values.
If you frequently build different models based on one or more parameters, it might be worth having a look at the Jackhammer extension by Old World Computing - its Indexed Model operator does exactly this for you (it builds specific models for different parameter values). It makes using these models straightforward, since you get one single model that you can then apply to your data - the model matching the parameters is selected and applied automatically.
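For comparison only, here is a minimal sketch of the same per-date logic outside RapidMiner, written in Python with pandas and mlxtend (both are assumptions, as is the file name transactions.csv; the column names ReceiptDate, Invoice and product 1 follow the process above):

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules

# hypothetical export of the transaction data, one row per purchased item
df = pd.read_csv("transactions.csv")

rules_per_day = {}
for date, day in df.groupby("ReceiptDate"):
    # pivot: one row per invoice, one boolean "purchased yes/no" column per product
    basket = day.groupby(["Invoice", "product 1"]).size().unstack(fill_value=0) > 0
    itemsets = fpgrowth(basket, min_support=0.005, use_colnames=True)
    if not itemsets.empty:
        rules_per_day[date] = association_rules(itemsets, metric="confidence", min_threshold=0.1)

The grouping plays the role of Loop Values: each group is pivoted into the binominal basket format before FP-Growth and the rule generation run on it, so you again end up with one rule set per date.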

Related

Calculate Percentage Values

My RapidMiner process results are as follows:
Row No. Count
1 9.0
2 11.0
3 32.0
If I want to calculate:
(9/32)*100 and
(11/32)*100
from this result set, how would I do it?
The solution is not quite straightforward, as RapidMiner normally treats examples (rows) as independent of each other.
What you can do is extract the needed value as a macro and use it in the Generate Attributes operator.
See the sample process below for a solution to your particular problem. Just copy and paste the XML into your process window in RapidMiner.
Also feel free to ask further questions, or re-post this one, in the RapidMiner community forum.
<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="112" y="85">
<list key="attribute_values">
<parameter key="Count" value="9"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="112" y="187">
<list key="attribute_values">
<parameter key="Count" value="11"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="112" y="340">
<list key="attribute_values">
<parameter key="Count" value="32"/>
</list>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="124" name="Append" width="90" x="380" y="187"/>
<operator activated="true" class="extract_macro" compatibility="7.6.001" expanded="true" height="68" name="Extract Macro" width="90" x="581" y="187">
<parameter key="macro" value="divisor"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="Count"/>
<parameter key="example_index" value="3"/>
<list key="additional_macros"/>
<description align="center" color="green" colored="true" width="126">Extracting the third value as a macro. It can be the called using the %{macro_name} syntax</description>
</operator>
<operator activated="true" class="generate_attributes" compatibility="7.6.001" expanded="true" height="82" name="Generate Attributes" width="90" x="782" y="187">
<list key="function_descriptions">
<parameter key="Percentage" value="5"/>
</list>
<description align="center" color="green" colored="true" width="126">Creating a new Attribute (column) with the desired calculation<br><br>Check the final paragraph of the help text for the &quot;Generate Attribute&quot; Operator for a description of how to work with macros</description>
</operator>
<connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
<connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
<connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
<connect from_op="Append" from_port="merged set" to_op="Extract Macro" to_port="example set"/>
<connect from_op="Extract Macro" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<description align="center" color="yellow" colored="false" height="581" resized="true" width="444" x="56" y="18">Generating sample data to fit the original problem</description>
</process>
</operator>
</process>
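Just to check the arithmetic, the same extract-one-value-then-compute pattern can be sketched in pandas (the Count values come from the question; taking the third row as the divisor mirrors the Extract Macro settings above):

import pandas as pd

counts = pd.DataFrame({"Count": [9.0, 11.0, 32.0]})

# "Extract Macro": pull the value of the third example out as a plain number
divisor = counts["Count"].iloc[2]

# "Generate Attributes": add a Percentage column computed from that single value
counts["Percentage"] = counts["Count"] / divisor * 100
print(counts)  # Percentage: 28.125, 34.375, 100.0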

RapidMiner: sort and filter attributes

I am using RapidMiner, and I would like to order my attributes per example and then keep only the 15 highest values. Can someone give me a clue on how to do that?
Thanks.
So if I understand you correctly, you have an example set, and for each row you want to sort the attributes and keep only the 15 biggest values?
So imagine you have a random data set and, for each example, you want a result that contains only its largest attribute values (the example process below is limited to the 5 biggest attributes for simplicity).
This can be done by looping over your example set with Loop Examples, filtering out the current row with Filter Example Range, then transposing that row to columns with Transpose; the result can be sorted with Sort and limited to the 15 largest values with another Filter Example Range.
Please note that these new examples (rows) cannot easily be represented in a single example set, since they presumably all have different attributes in a different order, so you will get a collection of separate, sorted example sets.
Here is an example process:
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="generate_data" compatibility="7.5.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="34">
<parameter key="number_examples" value="10"/>
<parameter key="number_of_attributes" value="10"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<description align="center" color="transparent" colored="false" width="126">remove label for clarity</description>
</operator>
<operator activated="true" class="loop_examples" compatibility="7.5.000" expanded="true" height="103" name="Loop Examples" width="90" x="313" y="34">
<process expanded="true">
<operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range" width="90" x="45" y="34">
<parameter key="first_example" value="%{example}"/>
<parameter key="last_example" value="%{example}"/>
<description align="center" color="transparent" colored="false" width="126">only work on current example</description>
</operator>
<operator activated="true" class="transpose" compatibility="7.5.000" expanded="true" height="82" name="Transpose" width="90" x="179" y="34">
<description align="center" color="transparent" colored="false" width="126">transpose the row to columns with attribute name and attribute value</description>
</operator>
<operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
<parameter key="old_name" value="id"/>
<parameter key="new_name" value="Example%{example}AttributeName"/>
<list key="rename_additional_attributes">
<parameter key="att_1" value="Value"/>
</list>
<description align="center" color="transparent" colored="false" width="126">rename for clarity, include original &quot;row&quot; number in attribute name</description>
</operator>
<operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="447" y="34">
<parameter key="attribute_name" value="Value"/>
<parameter key="sorting_direction" value="decreasing"/>
<description align="center" color="transparent" colored="false" width="126">sort decreasing by attribute value</description>
</operator>
<operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range (2)" width="90" x="581" y="34">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="5"/>
<description align="center" color="transparent" colored="false" width="126">keep 5 attributes with biggest values</description>
</operator>
<connect from_port="example set" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<connect from_op="Transpose" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Filter Example Range (2)" to_port="example set input"/>
<connect from_op="Filter Example Range (2)" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
<description align="center" color="transparent" colored="false" width="126">Loop over all examples, current index is stored in the macro &quot;example&quot;</description>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
<connect from_op="Loop Examples" from_port="output 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
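For reference, a minimal pandas sketch of the same per-row idea (the data is random, like the output of the Generate Data operator above; keeping the 15 largest values per row matches the original question, and the result is a collection of per-row series rather than one table, just like the collection of example sets returned by the loop):

import numpy as np
import pandas as pd

# random data set comparable to the one produced by Generate Data
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(10, 20)),
                  columns=[f"att{i}" for i in range(1, 21)])

# for each example (row): sort its attribute values decreasingly and keep the 15 largest;
# the rows end up with different attributes in a different order, so they are kept
# as a dictionary of Series instead of being forced back into a single example set
top15 = {idx: row.nlargest(15) for idx, row in df.iterrows()}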

Filtering out 'most recent' example record in Rapidminer

I'm trying to filter an example set of commercial properties in RapidMiner. Many of the properties are duplicated because the property transaction history is included in the data table, and many of the properties have been sold more than once over the period covered by the table. What I want to do is filter out all but the most recent transaction for each property.
I can't figure out how to filter out all but the record with the most recent transaction date. Any help would be appreciated.
You should post a standalone reproducible example that includes data to show what you have tried so far.
Without this, the general advice would be along these lines: use the Aggregate operator to find the maximum date for each property, then use the Join operator to inner-join the original example set with the example set containing the maxima.
Here's a toy example using the Iris data set that might be applicable in your case.
<?xml version="1.0" encoding="UTF-8"?><process version="7.4.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true" height="68" name="Retrieve Iris" width="90" x="112" y="187">
<parameter key="repository_entry" value="//Samples/data/Iris"/>
</operator>
<operator activated="true" class="aggregate" compatibility="7.4.000" expanded="true" height="82" name="Aggregate" width="90" x="313" y="187">
<list key="aggregation_attributes">
<parameter key="a1" value="maximum"/>
</list>
<parameter key="group_by_attributes" value="label"/>
</operator>
<operator activated="true" class="join" compatibility="7.4.000" expanded="true" height="82" name="Join" width="90" x="514" y="187">
<parameter key="use_id_attribute_as_key" value="false"/>
<list key="key_attributes">
<parameter key="label" value="label"/>
<parameter key="a1" value="maximum(a1)"/>
</list>
</operator>
<connect from_op="Retrieve Iris" from_port="output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Join" to_port="right"/>
<connect from_op="Aggregate" from_port="original" to_op="Join" to_port="left"/>
<connect from_op="Join" from_port="join" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
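Translated into a minimal pandas sketch (properties.csv, property_id and transaction_date are hypothetical names standing in for your data):

import pandas as pd

df = pd.read_csv("properties.csv", parse_dates=["transaction_date"])

# index of the most recent transaction within each property - the equivalent of
# Aggregate (maximum date per property) followed by an inner Join on that maximum
latest = df.loc[df.groupby("property_id")["transaction_date"].idxmax()]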

How to load transaction (basket) data in RapidMiner for association rule?

I have comma-separated transaction (basket) data in itemset format:
citrus fruit,semi-finished,bread,margarine
tropical fruit,yogurt,coffee,milk
yogurt,cream,cheese,meat spreads
etc
where each row indicates the items purchased in a single transaction.
Using the Read CSV operator, I loaded this file into RapidMiner, but I could not find any operator to transform this data for FP-Growth and association rule mining.
Is there any way to read this type of file in RapidMiner for association rule mining?
I finally understood what you meant - sorry, I was being slow. This can be done using operators from the Text Processing extension, which you have to install first from the RapidMiner Marketplace. Once you have it installed, you can try the process below.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.0.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.0.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="7.0.000" expanded="true" height="68" name="Read CSV" width="90" x="246" y="85">
<parameter key="csv_file" value="C:\Temp\is.txt"/>
<parameter key="column_separators" value="\r\n"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations"/>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="att1.true.polynominal.attribute"/>
</list>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="7.0.000" expanded="true" height="82" name="Nominal to Text" width="90" x="380" y="85"/>
<operator activated="true" class="text:process_document_from_data" compatibility="7.0.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="514" y="85">
<parameter key="vector_creation" value="Term Occurrences"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.0.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
<parameter key="mode" value="specify characters"/>
<parameter key="characters" value=","/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
The trick is to use Read CSV to read the original file, but with end-of-line as the column separator, so each complete line is read in as a single polynominal attribute. From there, you have to convert this attribute to text so that the text processing operators can do their work. The Process Documents from Data operator is then used to build the final example set; the important point is to use the Tokenize operator to split each line into items at the commas.
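The same tokenize-at-the-commas idea looks roughly like this in Python (the file name and encoding follow the Read CSV parameters above; mlxtend's TransactionEncoder is an assumption, and any other one-hot encoding would do just as well):

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# read every line as one transaction and split it into items at the commas
with open(r"C:\Temp\is.txt", encoding="windows-1252") as f:
    transactions = [line.strip().split(",") for line in f if line.strip()]

# one row per transaction, one boolean column per item - ready for FP-Growth
encoder = TransactionEncoder()
basket = pd.DataFrame(encoder.fit(transactions).transform(transactions),
                      columns=encoder.columns_)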

Build a correlation matrix from columns of two CSV files in RapidMiner

I want to build a correlation matrix with two different columns from two different CSV files. Can anyone tell me how to specify one column from one file and the corresponding column from the other file?
You have to create a new example set by joining the two columns together with the Join operator; then you can calculate the correlation matrix. Make sure when joining that the two example sets share the same ID attribute.
The code block below shows an example process of how to select and join two attributes.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="6.4.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.0.000-SNAPSHOT" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="7.0.000-SNAPSHOT" expanded="true" height="60" name="Retrieve Iris" width="90" x="45" y="75">
<parameter key="repository_entry" value="//Samples/data/Iris"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.0.000-SNAPSHOT" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="75">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="a1"/>
</operator>
<operator activated="true" class="retrieve" compatibility="7.0.000-SNAPSHOT" expanded="true" height="60" name="Retrieve Iris (2)" width="90" x="45" y="255">
<parameter key="repository_entry" value="//Samples/data/Iris"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.0.000-SNAPSHOT" expanded="true" height="76" name="Select Attributes (2)" width="90" x="179" y="255">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="a2"/>
</operator>
<operator activated="true" class="join" compatibility="7.0.000-SNAPSHOT" expanded="true" height="76" name="Join" width="90" x="380" y="165">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="correlation_matrix" compatibility="7.0.000-SNAPSHOT" expanded="true" height="94" name="Correlation Matrix" width="90" x="581" y="165"/>
<connect from_op="Retrieve Iris" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
<connect from_op="Retrieve Iris (2)" from_port="output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Join" to_port="right"/>
<connect from_op="Join" from_port="join" to_op="Correlation Matrix" to_port="example set"/>
<connect from_op="Correlation Matrix" from_port="matrix" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
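The equivalent join-then-correlate logic as a minimal pandas sketch (first.csv, second.csv, the shared id column and the two value columns are placeholders for your files):

import pandas as pd

left = pd.read_csv("first.csv")    # contains "id" and "column_a"
right = pd.read_csv("second.csv")  # contains the same "id" and "column_b"

# join the two files on the shared ID, then correlate the two columns
merged = left.merge(right, on="id")
correlation_matrix = merged[["column_a", "column_b"]].corr()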