I'm doing a multi-language translation job.
As you can see, the values in the right document should be replaced with those from the left document when the keys match.
How could I do it with Sublime Text?
The two documents are not a 1-to-1 mapping.
My expected result is:
I wrote a proof of concept that allows you to replace matched selections between a source ( left ) & target ( right ) file.
DEMO:
USAGE:
To use MatchReplace:
copy the plugin folder to your Packages directory
edit the replacementKeys array in the run function to suit your match+replace needs
save the plugin & restart SublimeText
open a 2-group window layout using Shift + Alt + 2
move the document you want as the SOURCE to the left group
move the document you want to MATCH with the source to the right group
open the Command Palette with Ctrl + Shift + P and run the Match Replace: Demo command
Note:
replacementKeys must be an exact match in both documents ( leading whitespace is ignored ).
If you want to allow for variation within the replacementKeys, you will need to implement an additional layer of RegEx processing, as sketched below.
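For example, one such extra layer (a minimal sketch, not part of the plugin; build_key_pattern is a hypothetical helper) could let a key match whether or not it is wrapped in double quotes:

import re

def build_key_pattern(key, queryPrefix="((^)|(^[\t\ ]+))", querySuffix=":"):
    # escape the literal key and allow optional surrounding double quotes
    return queryPrefix + '"?' + re.escape(key) + '"?' + querySuffix

# use build_key_pattern(key) in place of queryPrefix + key + querySuffix in the find() calls below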
Implementation:
get the active view of each group in a 2-group window
find RegEx matches of a user-defined array of keys, which are matched in both documents
store the value regions of both documents, and the string values of the source document
sort the stored values by region
iterate over the regions in the target document, replacing all matched values with those from the source
The demo is written to work with single tier JSON files, but can be adjusted as necessary.
RegEx patterns that precede & follow the replacementKeys are:
queryPrefix ( start of line, optionally followed by leading whitespace )
querySuffix ( the colon that terminates a key )
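For the demo key "flight_number" (added in run below), the composed pattern handed to view.find() looks roughly like this (illustrative only; the sample line is hypothetical):

# composed search pattern for one key
pattern = "((^)|(^[\t\ ]+))" + "flight_number" + ":"
# matches the key at the start of a line, optionally preceded by whitespace, e.g.
#     flight_number: "AA100",
# everything after the colon up to the end of the line is treated as the value

Note that with these defaults the key must appear unquoted; if your keys are quoted JSON strings, either include the quotes in the replacementKeys entries or loosen the pattern as noted above.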
CODE:
The script works a bit more smoothly with a custom Edit module of mine, so I recommend that you download the entire plugin here:
GitHub
import sublime, sublime_plugin
import Edit
import operator

class MatchReplaceDemoCommand ( sublime_plugin.TextCommand ):

    def run ( self, edit ):
        replacementKeys = []

        #■■■ Populate With Keys To Be Replaced In Document2 ■■■#
        replacementKeys.append ( "flight_number" )
        replacementKeys.append ( "price" )
        replacementKeys.append ( "payment" )

        self.replace_KeyValues ( replacementKeys )

    def replace_KeyValues ( self, replacementKeys ):
        window = self.view.window()
        document1_ResultStrings = {}
        document2_ResultRegions = {}

        #■■■ Verify : 2 Active Window Groups ■■■#
        windowGroup_Count = window.num_groups()
        if windowGroup_Count != 2:
            return

        #■■■ Set : Document Views ■■■#
        document1 = window.active_view_in_group ( 0 ) # Document 1 == SOURCE
        document2 = window.active_view_in_group ( 1 ) # Document 2 == MATCH
        edit = Edit.get ( document2 )

        #■■■ Set : Search Parameters ■■■#
        query_StartPosition = 0
        queryPrefix = "((^)|(^[\t\ ]+))"
        querySuffix = ":"

        #■■■ Store : KeyValue Regions & Strings ■■■#
        for key in replacementKeys:
            #■■■ Find Document1 Key Regions & Strings ■■■#
            document1_KeyRegion = document1.find ( queryPrefix + key + querySuffix, query_StartPosition )
            document1_ResultRegion_Start = document1_KeyRegion.b
            document1_ResultRegion_End = document1.line ( document1_KeyRegion ).b
            document1_ResultRegion = sublime.Region ( document1_ResultRegion_Start, document1_ResultRegion_End )
            document1_ResultString = document1.substr ( document1_ResultRegion )

            #■■■ Find Document2 Key Regions ■■■#
            document2_KeyRegion = document2.find ( queryPrefix + key + querySuffix, query_StartPosition )
            document2_ResultRegion_Start = document2_KeyRegion.b
            document2_ResultRegion_End = document2.line ( document2_KeyRegion ).b
            document2_ResultRegion = sublime.Region ( document2_ResultRegion_Start, document2_ResultRegion_End )

            #■■■ Verify Match ■■■#
            if document1_ResultRegion_Start != -1 \
            and document2_ResultRegion_Start != -1:
                document1_ResultStrings[ key ] = document1_ResultString
                document2_ResultRegions[ key ] = document2_ResultRegion

        #■■■ Verify : Matches Found ■■■#
        if len ( document1_ResultStrings ) == 0 \
        or len ( document2_ResultRegions ) == 0:
            return

        #■■■ Sort Regions To Avoid Replacement Overlap ■■■#
        document2_ResultRegions = sorted ( document2_ResultRegions.items(), key=operator.itemgetter ( 1 ) )

        #■■■ Replace Matched KeyValues ■■■#
        for key, value in reversed ( document2_ResultRegions ):
            replacementField = key
            replacementRegion = value
            replacementString = document1_ResultStrings[ key ]
            edit.replace ( replacementRegion, replacementString )
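To make the expected result concrete, here is a hypothetical pair of single-tier documents before and after running Match Replace: Demo (everything after each matched key's colon, up to the end of the line, is copied from the source into the target; the keys need not appear in the same order in the two files):

# SOURCE (left group)
flight_number: "AA100",
price: 120,
payment: "card",

# TARGET (right group), before
payment: "cash",
flight_number: "AA090",
price: 99,

# TARGET (right group), after
payment: "card",
flight_number: "AA100",
price: 120,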
I have a Google BigQuery table with a column containing large JSON strings. In each row, there is a different number of keys and nested keys that I would like to flatten into columns.
My table looks as follows:

id | payload
1  | {"key1":{"value":"1"},"key2":2,"key3":1,"key4":"abcde","version":10}
2  | {"key1":{"value":"2"},"key2":5,"key3":2,"key4":"defg","version":11}
I have managed to extract single columns by using the BigQuery functions JSON_EXTRACT and/or JSON_EXTRACT_SCALAR:
SELECT id, JSON_EXTRACT(payload, '$.key1') as key1
FROM `project.dataset.table`
etc. However, I don't want to hand-code more than 100 keys that are nested in the JSON column. There has to be a better way!
I am grateful for any kind of support!
Consider the approach below:
create temp function extract_keys(input string) returns array<string> language js as """
return Object.keys(JSON.parse(input));
""";
create temp function extract_values(input string) returns array<string> language js as """
return Object.values(JSON.parse(input));
""";
create temp function extract_all_leaves(input string) returns string language js as '''
function flattenObj(obj, parent = '', res = {}){
for(let key in obj){
let propName = parent ? parent + '.' + key : key;
if(typeof obj[key] == 'object'){
flattenObj(obj[key], propName, res);
} else {
res[propName] = obj[key];
}
}
return JSON.stringify(res);
}
return flattenObj(JSON.parse(input));
''';
create temp table temp_table as (
select offset, key, value, id
from your_table t,
unnest([struct(extract_all_leaves(payload) as leaves)]),
unnest(extract_keys(leaves)) key with offset
join unnest(extract_values(leaves)) value with offset
using(offset)
);
execute immediate (select '''
select * from (select * except(offset) from temp_table)
pivot (any_value(value) for replace(key, '.', '__') in (''' || keys_list || '''
))'''
from (select string_agg('"' || replace(key, '.', '__') || '"', ',' order by offset) keys_list from (
select key, min(offset) as offset from temp_table group by key
))
);
If applied to the sample data in your question, the output is one row per id, with one column for each flattened key: key1__value, key2, key3, key4 and version (the '.' in nested key paths is replaced with '__' so that the names are valid column names).
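The heavy lifting above is the recursive dot-path flattening done by the extract_all_leaves UDF. The same idea in a short Python sketch, for illustration only (the JS version recurses into any object, this sketch only into dicts):

def flatten_obj(obj, parent=""):
    # flatten nested dicts into dot-separated leaf paths
    res = {}
    for key, value in obj.items():
        name = parent + "." + key if parent else key
        if isinstance(value, dict):
            res.update(flatten_obj(value, name))
        else:
            res[name] = value
    return res

# flatten_obj({"key1": {"value": "1"}, "key2": 2}) -> {"key1.value": "1", "key2": 2}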
I'm working with a table of addresses in Power BI. The table also has a column marking some condition, it could be anything so I'll just label it "condition".
I'm trying to create a column (or measure) showing duplicate addresses. The problem I'm having is that both/all duplicates need to meet that other condition. Rows that don't should just be ignored from the start. I thought these nested IF statements would work:
Duplicate =
IF(
CALCULATE(COUNTROWS(Table),
FILTER(Table,Table[Condition]="Yes")),
IF(
CALCULATE(COUNTROWS(Table),
FILTER(Table,Table[Address]=EARLIER(Table[Address])))>1,
"Duplicate",BLANK()
)
)
But duplicate pairs where only one row meets the condition are still marked. What am I doing wrong?
I need all rows elsewhere so I can't filter the query. Also, I know I could add the condition to the concatenation, but that seems sloppy and I assume there's a more "correct" way to do it.
I don't understand how your outer IF function is supposed to work since the first argument is an integer rather than True/False.
Try this instead:
Duplicate =
IF (
COUNTROWS (
FILTER (
Table,
Table[Condition] = "Yes" &&
Table[Address] = EARLIER ( Table[Address] )
)
) > 1,
"Duplicate",
BLANK ()
)
Edit: As you pointed out, this didn't work exactly as intended. Try one of the following instead:
Duplicate =
IF (
COUNTROWS (
FILTER (
Table,
EARLIER ( Table[Condition] ) = "Yes" &&
Table[Condition] = "Yes" &&
Table[Address] = EARLIER ( Table[Address] )
)
) > 1,
"Duplicate",
BLANK ()
)
or
Duplicate =
IF (
Table[Condition] = "Yes" &&
COUNTROWS (
FILTER (
Table,
Table[Condition] = "Yes" &&
Table[Address] = EARLIER ( Table[Address] )
)
) > 1,
"Duplicate",
BLANK ()
)
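Both variants encode the same rule: a row is flagged only if it meets the condition itself and at least two rows sharing its address meet the condition. A rough Python sketch of that rule, purely as an illustration of the logic (not DAX):

def flag_duplicates(rows):
    # count, per address, only the rows that meet the condition
    counts = {}
    for r in rows:
        if r["condition"] == "Yes":
            counts[r["address"]] = counts.get(r["address"], 0) + 1
    # flag a row only if it meets the condition and that count exceeds 1
    return ["Duplicate" if r["condition"] == "Yes" and counts.get(r["address"], 0) > 1 else None
            for r in rows]

# flag_duplicates([{"address": "1 Main St", "condition": "Yes"},
#                  {"address": "1 Main St", "condition": "No"},
#                  {"address": "2 Oak Ave", "condition": "Yes"},
#                  {"address": "2 Oak Ave", "condition": "Yes"}])
# -> [None, None, "Duplicate", "Duplicate"]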
We all know these excellent ABAP statements which allow finding unique values in a one-liner:
it_unique = VALUE #( FOR GROUPS value OF <line> IN it_itab
GROUP BY <line>-field WITHOUT MEMBERS ( value ) ).
But what about extracting duplicates? Can one utilize the GROUP BY syntax for that task, or are table comprehensions perhaps more useful here?
The only (though not very elegant) way I found is:
LOOP AT lt_marc ASSIGNING FIELD-SYMBOL(<fs_marc>) GROUP BY ( matnr = <fs_marc>-matnr
werks = <fs_marc>-werks )
ASSIGNING FIELD-SYMBOL(<group>).
members = VALUE #( FOR m IN GROUP <group> ( m ) ).
IF lines( members ) > 1.
"throw error
ENDIF.
ENDLOOP.
Is there a more elegant way of finding duplicates by an arbitrary key?
So I'll just post it as an answer, since Florian and I weren't able to come up with anything better. If somebody can improve it, please do.
TYPES tt_materials TYPE STANDARD TABLE OF marc WITH DEFAULT KEY.
DATA duplicates TYPE tt_materials.
LOOP AT materials INTO DATA(material)
GROUP BY ( id = material-matnr
status = material-pstat
size = GROUP SIZE )
ASCENDING REFERENCE INTO DATA(group_ref).
CHECK group_ref->*-size > 1.
duplicates = VALUE tt_materials( BASE duplicates FOR <status> IN GROUP group_ref ( <status> ) ).
ENDLOOP.
Given
TYPES: BEGIN OF key_row_type,
matnr TYPE matnr,
werks TYPE werks_d,
END OF key_row_type.
TYPES key_table_type TYPE
STANDARD TABLE OF key_row_type
WITH DEFAULT KEY.
TYPES: BEGIN OF group_row_type,
matnr TYPE matnr,
werks TYPE werks_d,
size TYPE i,
END OF group_row_type.
TYPES group_table_type TYPE
STANDARD TABLE OF group_row_type
WITH DEFAULT KEY.
TYPES tt_materials TYPE STANDARD TABLE OF marc WITH DEFAULT KEY.
DATA(materials) = VALUE tt_materials(
( matnr = '23' werks = 'US' maabc = 'B' )
( matnr = '42' werks = 'DE' maabc = 'A' )
( matnr = '42' werks = 'DE' maabc = 'B' ) ).
When
DATA(duplicates) =
VALUE key_table_type(
FOR key IN VALUE group_table_type(
FOR GROUPS group OF material IN materials
GROUP BY ( matnr = material-matnr
werks = material-werks
size = GROUP SIZE )
WITHOUT MEMBERS ( group ) )
WHERE ( size > 1 )
( matnr = key-matnr
werks = key-werks ) ).
Then
cl_abap_unit_assert=>assert_equals(
act = duplicates
exp = VALUE key_table_type( ( matnr = '42' werks = 'DE' ) ) ).
Readability of this solution is so bad that you should only ever use it in a method with a revealing name like collect_duplicate_keys.
Also note that the statement's length increases with a growing number of key fields, as the GROUP SIZE addition requires listing the key fields one by one as a list of simple types.
What about the classics? I'm not sure whether they are deprecated or anything, but my first thought is to create a (sorted) clone of the table, run DELETE ADJACENT DUPLICATES on it and then just compare the lines( ) of both...
I'll be eager to read new options.
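For what it's worth, every approach above boils down to the same idea, group by the key fields and keep only groups with more than one member; outside ABAP that reads roughly like this Python sketch (illustration only):

from collections import Counter

def duplicate_keys(rows, key_fields):
    # count rows per key tuple and return the keys that occur more than once
    counts = Counter(tuple(row[f] for f in key_fields) for row in rows)
    return [key for key, n in counts.items() if n > 1]

# duplicate_keys([{"matnr": "23", "werks": "US"},
#                 {"matnr": "42", "werks": "DE"},
#                 {"matnr": "42", "werks": "DE"}], ["matnr", "werks"])
# -> [('42', 'DE')]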
For R = 1 To NRrows
If (RSNonResourceCosts![CostType]) <> "" Then
CL(1) = CL(1) + 1
WKS.Cells(199 + R, 1) = (RSNonResourceCosts![CostType])
End If
If (RSNonResourceCosts![SoftwareCosts]) <> "" Then
CL(2) = CL(2) + 1
WKS.Cells(199 + R, 2) = (RSNonResourceCosts![SoftwareCosts])
End If
RSNonResourceCosts.MoveNext
Next R
Attached is a cut-down version of the code. I am writing to an Excel spreadsheet to apply an INDIRECT formula. [CostType] and [SoftwareCosts] in the example are constants at the moment.
I want the user to be able to add fields without the code then having to be amended. The new field name will be derived from the table. Is it possible to use a field name (not known until the table is modified) between the square brackets?
Yes:
FieldName = "SomeField"
WKS.Cells(199 + R, 1) = RSNonResourceCosts.Fields(FieldName).Value
I have a table named passive that contains a list of timestamped events per user. I want to fill the attribute duration, which corresponds to the time between the current row's event and the next event by the same user.
I tried the following query:
UPDATE passive as passive1
SET passive1.duration = (
SELECT min(UNIX_TIMESTAMP(passive2.event_time) - UNIX_TIMESTAMP(passive1.event_time) )
FROM passive as passive2
WHERE passive1.user_id = passive2.user_id
AND UNIX_TIMESTAMP(passive2.event_time) - UNIX_TIMESTAMP(passive1.event_time) > 0
);
This returns the error message Error 1093 - You can't specify target table for update in FROM.
In order to circumvent this limitation, I tried to follow the structure given in https://stackoverflow.com/a/45498/395857, which uses a nested subquery in the FROM clause to create an implicit temporary table, so that it doesn't count as the same table we're updating:
UPDATE passive
SET passive.duration = (
SELECT *
FROM (SELECT min(UNIX_TIMESTAMP(passive2.event_time) - UNIX_TIMESTAMP(passive.event_time))
FROM passive, passive as passive2
WHERE passive.user_id = passive2.user_id
AND UNIX_TIMESTAMP(passive2.event_time) - UNIX_TIMESTAMP(passive.event_time) > 0
)
AS X
);
However, the passive table in the nested subquery doesn't refer to the same passive as in the main query. Because of that, all rows have the same passive.duration value. How can I refer to the main query's passive in the nested subquery? (or maybe are there some alternative ways to structure such a query?)
Try like this:
UPDATE passive as passive1
SET passive1.duration = (
SELECT min(UNIX_TIMESTAMP(passive2.event_time) - UNIX_TIMESTAMP(passive1.event_time) )
FROM (SELECT * FROM passive) passive2
WHERE passive1.user_id = passive2.user_id
AND UNIX_TIMESTAMP(passive2.event_time) - UNIX_TIMESTAMP(passive1.event_time) > 0
)
;
We can use a Python script to circumvent the issue:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Fill observed_event_duration for each row by looking up the next event of the same user.
An index on (user_id, observed_event_timestamp) is needed to speed this up.
'''
# MySQLdb download: http://sourceforge.net/projects/mysql-python/?source=dlp
# Tutorials: http://mysql-python.sourceforge.net/MySQLdb.html
#            http://zetcode.com/db/mysqlpython/
import MySQLdb as mdb
import datetime

def main():
    start = datetime.datetime.now()
    db = mdb.connect(user="root", passwd="password", db="db_name")
    db2 = mdb.connect(user="root", passwd="password", db="db_name")
    cursor = db.cursor()
    cursor2 = db2.cursor()
    cursor.execute("SELECT observed_event_id, user_id, observed_event_timestamp FROM observed_events ORDER BY observed_event_timestamp ASC")
    count = 0
    for row in cursor:
        count += 1
        timestamp = row[2]
        user_id = row[1]
        primary_key = row[0]
        # Find the next event of the same user
        sql = 'SELECT observed_event_timestamp FROM observed_events WHERE observed_event_timestamp > "%s" AND user_id = "%s" ORDER BY observed_event_timestamp ASC LIMIT 1' % (timestamp, user_id)
        cursor2.execute(sql)
        duration = 0
        for row2 in cursor2:
            duration = (row2[0] - timestamp).total_seconds()
            if duration > (60 * 60):
                # Ignore gaps longer than one hour
                duration = 0
            break
        cursor2.execute("UPDATE observed_events SET observed_event_duration=%s WHERE observed_event_id = %s" % (duration, primary_key))
        if count % 1000 == 0:
            db2.commit()
            print "Percent done: " + str(float(count) / cursor.rowcount * 100) + "%" + " in " + str((datetime.datetime.now() - start).total_seconds()) + " seconds."
    # Commit any remaining updates before closing
    db2.commit()
    db.close()
    db2.close()
    diff = (datetime.datetime.now() - start).total_seconds()
    print 'finished in %s seconds' % diff

if __name__ == "__main__":
    main()