PowerQuery: Function to get Duplicates info for given Columnnames - function
I need a function in Power Query that adds extra columns with information about duplicated rows (rather than just keeping or removing duplicates).
Example:
For the given table I want to get following info for duplicated columns set {"Date", "Product", "Color"}:
Minimal RowId - basically, the Id of the first occurrence of the data
Nr. of Duplicate - a duplicate counter within the MinRowId group
NB! For non-duplicates it should return null values
try grouping then expanding in powerquery
// Numbers the rows inside every (Product, Color) group and attaches the group's smallest RowId.
let
    Source = Excel.CurrentWorkbook(){[Name = "Table1"]}[Content],
    // Columns pulled back out of the nested per-group tables (names are kept unchanged)
    ExpandCols = {"RowId", "Date", "amount", "nDupl"},
    // One row per key pair: the member rows numbered from 0, plus the minimal RowId of the group
    PerGroup = Table.Group(
        Source,
        {"Product", "Color"},
        {
            {"data", (grp) => Table.AddIndexColumn(grp, "nDupl", 0, 1, Int64.Type), type table},
            {"MinRowID", (grp) => List.Min(grp[RowId]), type number}
        }
    ),
    // Flatten the nested tables back into ordinary rows
    Flattened = Table.ExpandTableColumn(PerGroup, "data", ExpandCols, ExpandCols)
in
    Flattened
Please try following function (download):
Function call Example:
tfnAddDuplicatesInfo2(Source,{"Product","Color","Date"},"DuplInfo" ,"RowId")
Function Arguments:
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
outDuplInfo as text, // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
optional inRowId as nullable text // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns
Function body:
let
// Adds one column, named by outDuplInfo, that describes duplicates of the inGroupBy key.
//   * Rows are grouped by inGroupBy and numbered 0, 1, 2, ... inside each group.
//   * Groups that hold a single row get null instead of a counter / minimal RowId.
//   * With inRowId: the new column holds a record [MinRowId, nDupl] (or null).
//     Without inRowId: the new column holds just the counter (or null).
//   * The result keeps the source columns in their original column order, followed by outDuplInfo.
// NOTE(review): rows are returned in grouped order, which may differ from the input row order.
func = (
    srcTable as table,                  // input Table
    inGroupBy as list,                  // List of ColumnNames to search duplicates
    outDuplInfo as text,                // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
    optional inRowId as nullable text   // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns
) =>
    let
        Source = srcTable,
        // // To test as script
        // inGroupBy = {"Product", "Color","Date"},
        // outDuplInfo = "DuplInfo",
        // inRowId = "RowId", // null, "RowId",

        //> == Variables ===================================================
        // Evaluated once instead of re-testing inRowId in every step / every group
        hasRowId = inRowId <> null,
        // Non-key columns plus the temporary counter are what has to be expanded back after grouping
        Columns2Expand = List.Combine({
            List.Difference(Table.ColumnNames(Source), inGroupBy),
            {"__outDuplCounter__"}
        }),
        srcType = Value.Type(Source),
        // Row type of the nested per-group tables: the source row plus the temporary counter column
        srcTypeRow = Type.ForRecord(
            Record.Combine({
                Type.RecordFields(Type.TableRow(srcType)),
                Type.RecordFields(type [__outDuplCounter__ = Int64.Type])
            }),
            false
        ),
        // Column type of the RowId column (Any.Type when no RowId column was supplied)
        RowIdType = if hasRowId then Type.TableColumn(srcType, inRowId) else Any.Type,
        // The aggregator receives each group as a table, so the column is read with Table.Column
        MinRowIdOfGroup =
            if hasRowId
            then (grp as table) => List.Min(Table.Column(grp, inRowId))
            else (grp as table) => null,
        //< == Variables ===================================================

        #"Grouped Rows" = Table.Group(
            Source,
            inGroupBy,
            {
                {"__tmpCount__", each Table.RowCount(_), Int64.Type},
                {"__MinGroupRowId__", MinRowIdOfGroup, RowIdType},
                {"__AllRows__", each Table.AddIndexColumn(_, "__outDuplCounter__", 0, 1, Int64.Type), type table srcTypeRow}
            }
        ),
        #"Expanded __AllRows__" = Table.ExpandTableColumn(#"Grouped Rows", "__AllRows__", Columns2Expand),
        // Blank out the helper columns for groups without duplicates (row count <= 1).
        // The "old value" argument is a row predicate; the custom replacer receives its result.
        nulls4MinRowId = Table.ReplaceValue(
            #"Expanded __AllRows__",
            each [__tmpCount__] <= 1,
            null,
            (currentValue, isConditionTrue, replacementValue) => if isConditionTrue then null else currentValue,
            if hasRowId then {"__MinGroupRowId__", "__outDuplCounter__"} else {"__outDuplCounter__"}
        ),
        Add_outDuplInfo =
            if hasRowId then
                Table.AddColumn(
                    nulls4MinRowId,
                    outDuplInfo,
                    each
                        if [__outDuplCounter__] = null
                        then null
                        else [MinRowId = [__MinGroupRowId__], nDupl = [__outDuplCounter__]],
                    type nullable [MinRowId = RowIdType, nDupl = Int64.Type]
                )
            else
                Table.AddColumn(nulls4MinRowId, outDuplInfo, each [__outDuplCounter__], Int64.Type),
        // Drop the temporary helper columns: keep the source columns plus the new info column
        Result_tfnAddDuplMinRowId = Table.SelectColumns(
            Add_outDuplInfo,
            List.Combine({Table.ColumnNames(Source), {outDuplInfo}})
        )
    in
        Result_tfnAddDuplMinRowId,
// Documentation metadata record for func; the closing expression of this query attaches it to func's type.
// NOTE(review): Documentation.Category " Running Total " looks like a leftover from the template this
// function was reworked from - confirm the intended category.
// NOTE(review): the descriptions say "two info columns", but func adds a single column named by
// outDuplInfo (a record with two fields, or just the counter) - consider rewording.
documentation = [
Documentation.Name = " tfnAddDuplicatesInfo2 ",
Documentation.Description = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
Documentation.LongDescription = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
Documentation.Category = " Running Total ",
Documentation.Source = " ",
Documentation.Version = " 1.0 ",
Documentation.Author = " Denis Sipchenko ",
Documentation.Examples = {
// Example 1: lists the function arguments as text; its Result is left empty
[
Description = "tfnAddDuplicatesInfo2 arguments: ",
Code = "
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
outDuplInfo as text, // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
optional inRowId as nullable text // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns",
Result =""
],
// Example 2: full call with the RowId column name passed as inRowId; Source is built from an embedded compressed Base64 payload
[
Description = "tfnAddDuplicatesInfo2 function call example ",
Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93+w8n2/uZWwBml07NfwVTIS+nN+PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw+x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox+UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
tfnAddDuplicatesInfo2(Source,{""Product"",""Color"",""Date""},""DuplInfo"" ,""RowId"")
",
Result = "Adds to Source table ""DuplInfo"" column with records:
""MinRowId"" - Minimal RowId within within given group,
""nDupl"" - given group Occurence Number
"
],
// Example 3: short call without inRowId, so only the occurrence counter column is added
[
Description = "tfnAddDuplicatesInfo2 function short call example ",
Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93+w8n2/uZWwBml07NfwVTIS+nN+PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw+x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox+UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
tfnAddDuplicatesInfo2(Source,{""Product"",""Color"",""Date""},""nDupl"")
",
Result = "Adds to Source table one column:
""nDupl"" - given group Occurence Number
"
]
}
]
in
// Put the documentation record on func's type as metadata and return func carrying that type
Value.ReplaceType(func, Value.ReplaceMetadata(Value.Type(func), documentation))
P.S. The idea of grouping and then expanding an index column is borrowed from horseyride's post.
P.P.S. Initially, I took Running Total by Category by Rick de Groot as a starting point and then reworked it.
Related
Powerquery: table function with variable parameter list length
How to write table function with non fixed parameter list length? Particular simplified example: I want to write function trimupper(TableName,ColumnName1,ColumnName2,...) that combines just two steps for given set of columns: TRIM whitespaces UPPERCASE text Example for two columns case: (tbl as table, cn1 as text, cn2 as text) => let #"Trimmed Text" = Table.TransformColumns(tbl,{{cn1, Text.Trim , type text}, {cn2, Text.Trim , type text}}), #"Uppercased Text" = Table.TransformColumns(tbl,{{cn1, Text.Upper, type text}, {cn2, Text.Upper, type text}}), trimupperResult = #"Uppercased Text" in trimupperResult But how to do it for variable number of ColumnNames?
let Source = Excel.CurrentWorkbook(){[Name="Table1"]}[Content], changethem = transform (Source,{"ColumnName1","ColumnName2"}) in changethem function transform (Table as table, columnnames as list) => let columnnames = if columnnames = null then Table.ColumnNames(Table) else columnnames, change = Table.TransformColumns( Table, List.Transform(columnnames, each {_, Text.Trim, type text} ) ), change1 = Table.TransformColumns( change, List.Transform(columnnames, each {_, Text.Upper, type text} ) ) in change1
NodeJS MySQL module can't use two placeholders in a query
I'm trying to do this, which is exactly as it is in the documentation: get_ids_query = 'SELECT ?? from ?? WHERE stat = 1 LIMIT 10' then I call the function with two values placed inside variables: var name = Table_Names.Tables_PR[t].name var PK = Table_Names.Tables_PR[t].PK ids = await this.getIds(PK, name) This is the function: async getIds(conf1, conf2) { return await this.mydb.query(this.get_ids_query, conf1, conf2) } and these are the logs: console.log(PK) console.log(name) console.log(mysql.format(this.get_ids_query, conf1, conf2)) output: idusers users SELECT `idusers` from ?? WHERE stat = 1 LIMIT 10 I've also tried: var name = [Table_Names.Tables_PR[t].name] var PK = [Table_Names.Tables_PR[t].PK] which logs like this and it still returns the same query: [ 'idusers' ] [ 'users' ] query format: SELECT `idusers` from ?? WHERE stat = 1 LIMIT 10 What am I doing wrong here? why is it reading the first placeholder but won't read the second one?
Pattern for rest query params in flask
Is there a pattern for dealing with query params in a flask rest server? I know I can create a sql query word for word using string manipulation in python, but I find that to be ugly and error prone, I was wondering if there is a better way. Here's what I have: param1 = request.args.get('param1', type = int) param2 = request.args.get('param2', type = int) if param1 is not None: if param2 is not None: cursor.execute("SELECT * FROM table WHERE p1 = %s AND p2 = %s", (str(param1), str(param2))) else: cursor.execute("SELECT * FROM table WHERE p1 = %s", (str(param1),)) else: if param2 is not None: cursor.execute("SELECT * FROM table WHERE p2 = %s", (str(param2),)) else: cursor.execute("SELECT * FROM table") It's easy to see the number of possible SQL statements is 2 to the number of parameters, which grows out of control... so, again, without using string manipulation to custom build the sql query, is there an idiom or pattern that is used to accomplish this in a more elegant way? Thanks.
Loop through your parameters. params = [] for i in range(1, HoweverManyParamsYouNeed): params.append(request.args.get('param' + str(i), type = int)) s = "" for i in range(1, len(params)): if params[ i ] is not None: if not s: s = "p" + str(i) + " = " + str(params[ i ]) else: s = s + " AND p" + str(i) + " = " + str(params[ i ]) full = "SELECT * FROM table" if s: full = full + " WHERE " + s cursor.execute(full) You might need to correct this code, since I do not have a way to run it.
I suggest using ORM(https://en.wikipedia.org/wiki/Object-relational_mapping) instead of raw sql queries. First you need install flask-sqlalchemy (https://flask-sqlalchemy.palletsprojects.com/) Then define your model class MyModel(db.Model): id = db.Column(db.Integer, primary_key=True) column1 = db.Column(db.Integer) column2 = db.Column(db.Integer) Let's say you have your filter lookup somewhere allowed_filters = {"column1", "column2"} Finally instead of cursor you can use SQLAlchemy's ORM to retrieve your filtered objects. query = MyModel.query for field, value in request.args.items(): if field in allowed_filters: query = query.filter(getattr(MyModel, field) == value) my_object_list = list(query.all()) If you really want to create your queries manually you can always iterate over args: where_clause = "" params = [] for field, value in request.args.items(): if field in allowed_filters: if len(where_clause) > 0: where_clause += " AND " where_clause += "{} = %s".format(field) params.append(value) if len(where_clause) > 0: cursor.execute("SELECT * FROM table WHERE {}".format(where_clause), tuple(params)) else: cursor.execute("SELECT * FROM table")
How to create an sql statement with optional parameters
I have a lessons table that contains the following fields: id title type language level Through the interface, the user can select which lesson they want to open. They start by selecting the language, then the type, and finally the level. During this process I want to query the database using a single SQL statement, but of course the first query will have only the language field. I came up with this syntax but it does not work: function queryLessonList (language, type, level){ const values = [language, type, level]; const sql = "SELECT * FROM lessons WHERE (language=?) AND (? is null OR type=?) AND (? is null OR level=?)"; return query(sql, values); } How can I make it work?
To reduce the complexity of checking variables and building out the query, instead you can pass the function an object to match, what you want and the columns you want returning etc (as * is not ideal). So something like: function queryLessonList (where = {}, columns = ['*']) { let keys = Object.keys(where) let values = Object.values(where) columns = !columns.length || columns[0] === '*' ? '*': columns.map(e => '`'+e+'`').join(',') let sql = ` SELECT ${columns} FROM lessons ${keys.length ? 'WHERE \`'+keys.join('` = ? AND `')+'\` = ?' : ''} ` return query(sql, values) } /* SELECT * FROM lessons WHERE `language` = ? AND `type` = ? */ queryLessonList({ language: 'en', type: 'foo' }, []) /* SELECT `id` FROM lessons */ queryLessonList({}, ['id']) /* SELECT * FROM lessons */ queryLessonList()
Generate n-gram for a specific column present in mysql db
I'm writing a code to generate n-grams for every record in the table by reading a specific column. def extract_from_db(inp_cust_id): sql_db = TatDBHelper() t_sql = "select notes from raw_data where customer_id = {0}" db_data = sql_db.execute_read(t_sql.format(inp_cust_id)) for row in db_data: text = row.values() bi_grams = generate_ngrams(text[0].encode("utf-8"), 2) print bi_grams def generate_ngrams(sentence, n): sentence = sentence.lower() sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence) tokens = [token for token in sentence.split(" ") if token != ""] ngrams = zip(*[tokens[i:] for i in range(n)]) return [" ".join(ngram) for ngram in ngrams] I'm getting the output like: ['i highly', 'highly recommend', 'recommend it'] ['the penguin', 'penguin encounter', 'encounter was', 'was awesome'] I want the output to look like below, can anybody help me to get this. ['i highly', 'highly recommend', 'recommend it', ... ]
Create another list, all_ngrams, and keep appending the values to it using .extend(); at the end you will have all the n-grams in one list. Try this: def extract_from_db(inp_cust_id): sql_db = TatDBHelper() t_sql = "select notes from raw_data where customer_id = {0}" db_data = sql_db.execute_read(t_sql.format(inp_cust_id)) all_ngrams = [] for row in db_data: text = row.values() bi_grams = generate_ngrams(text[0].encode("utf-8"), 2) all_ngrams.extend(bi_grams) print all_ngrams