SSIS supports Power Query as a Source, but it does not seem to support M code that uses Html.Table. I'm using only SQL Server Data Tools (SSDT).
The M code is below:
let
    Scrapper = (Page as number) as table =>
        let
            Source = Web.BrowserContents("https://www.zerohedge.com/?page=" & Number.ToText(Page)),
            #"Extracted Table From Html" = Html.Table(Source, {{"Title", ".teaser-title:nth-last-child(4)"}, {"Date", ".extras__created:nth-last-child(1)"}, {"Views", ".extras__views:nth-last-child(2)"}}, [RowSelector=".view-content:nth-last-child(2) > DIV.views-row"]),
            #"Changed Type" = Table.TransformColumnTypes(#"Extracted Table From Html",{{"Title", type text}, {"Date", type datetime}, {"Views", Int64.Type}})
        in
            #"Changed Type",
    Source = {0..1},
    #"Converted to Table" = Table.FromList(Source, Splitter.SplitByNothing(), null, null, ExtraValues.Error),
    #"Renamed Columns" = Table.RenameColumns(#"Converted to Table",{{"Column1", "Pages"}}),
    #"Invoked Custom Function" = Table.AddColumn(#"Renamed Columns", "Scrapper", each Scrapper([Pages])),
    #"Expanded Scrapper" = Table.ExpandTableColumn(#"Invoked Custom Function", "Scrapper", {"Title", "Date", "Views"}, {"Title", "Date", "Views"})
in
    #"Expanded Scrapper"
Error: 0x0 at Data Flow Task, Power Query Source: The import
Html.Table matches no exports. Did you miss a module reference?
I don't see any other solution than to wait for Microsoft to support Html.Table.
(error image in SSDT)
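One possible interim workaround (my own sketch, not something the post confirms works): do the scraping outside SSIS and load the result with a Flat File Source. The sketch below reuses the CSS selectors from the M query above; it assumes the requests and beautifulsoup4 packages are available and that those selectors still match the live page.

import csv
import requests
from bs4 import BeautifulSoup

rows = []
for page in range(2):  # pages 0..1, as in the M code
    html = requests.get(f"https://www.zerohedge.com/?page={page}", timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.select(".view-content > div.views-row"):
        title = row.select_one(".teaser-title")
        date = row.select_one(".extras__created")
        views = row.select_one(".extras__views")
        rows.append([
            title.get_text(strip=True) if title else None,
            date.get_text(strip=True) if date else None,
            views.get_text(strip=True) if views else None,
        ])

# Write a CSV that an SSIS Flat File Source can consume.
with open("zerohedge_pages.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Date", "Views"])
    writer.writerows(rows)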
I want to write a pandas DataFrame to an IBM i Series / AS400. I have already done a lot of research, but now I am stuck.
I have already run a lot of queries using pyodbc. For df.to_sql() I should, as I read in other posts, use SQLAlchemy with the ibm_db_sa dialect.
My current code is:
import urllib.parse

from sqlalchemy import create_engine

CONNECTION_STRING = (
    "driver={iSeries Access ODBC Driver};"
    "System=111.111.111.111;"
    "database=TESTDB;"
    "uid=USER;"
    "pwd=PASSW;"
)
quoted = urllib.parse.quote_plus(CONNECTION_STRING)
engine = create_engine('ibm_db_sa+pyodbc:///?odbc_connect={}'.format(quoted))
create_statement = df.to_sql("TABLETEST", engine, if_exists="append")
The following packages are installed:
python 3.9
ibm-db 3.1.3
ibm-db-sa 0.3.7
ibm-db-sa-py3 0.3.1.post1
pandas 1.3.5
pip 22.0.4
setuptools 57.0.0
SQLAlchemy 1.4.39
When I run it, I get the following error:
sqlalchemy.exc.ProgrammingError: (pyodbc.ProgrammingError) ('42S02', '[42S02] [IBM][System i Access ODBC Driver][DB2 for i5/OS]SQL0204 - COLUMNS in SYSCAT type *FILE not found. (-204) (SQLPrepare)')
[SQL: SELECT "SYSCAT"."COLUMNS"."COLNAME", "SYSCAT"."COLUMNS"."TYPENAME", "SYSCAT"."COLUMNS"."DEFAULT", "SYSCAT"."COLUMNS"."NULLS", "SYSCAT"."COLUMNS"."LENGTH", "SYSCAT"."COLUMNS"."SCALE", "SYSCAT"."COLUMNS"."IDENTITY", "SYSCAT"."COLUMNS"."GENERATED"
FROM "SYSCAT"."COLUMNS"
WHERE "SYSCAT"."COLUMNS"."TABSCHEMA" = ? AND "SYSCAT"."COLUMNS"."TABNAME" = ? ORDER BY "SYSCAT"."COLUMNS"."COLNO"]
[parameters: ('USER', 'TABLETEST')]
(Background on this error at: https://sqlalche.me/e/14/f405)
I think the dialect could be wrong, because the parameters are the username and the table for the ODBC connection?
Also: I am not really sure what the difference is between ibm_db_sa and ibm_db.
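As an aside (my addition, not from the post): ibm_db is IBM's low-level DB-API driver, while ibm_db_sa is the SQLAlchemy dialect layered on top of it. A minimal sketch of the difference, with placeholder host, port, and credentials:

# Sketch only: host, port 446 and credentials are placeholders; assumes ibm_db and ibm_db_sa are installed.
import ibm_db                          # low-level IBM DB-API driver
from sqlalchemy import create_engine   # ibm_db_sa is the SQLAlchemy dialect on top of ibm_db

# ibm_db: build the connection string and run SQL yourself.
conn = ibm_db.connect(
    "DATABASE=TESTDB;HOSTNAME=111.111.111.111;PORT=446;UID=USER;PWD=PASSW;", "", ""
)
stmt = ibm_db.exec_immediate(conn, "SELECT 1 FROM SYSIBM.SYSDUMMY1")
print(ibm_db.fetch_tuple(stmt))

# ibm_db_sa: the same driver exposed through a SQLAlchemy URL, so that
# pandas.DataFrame.to_sql() and other SQLAlchemy consumers can use it.
engine = create_engine("ibm_db_sa://USER:PASSW@111.111.111.111:446/TESTDB")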
I tried for a few days; anyone who is trying to do this via SQLAlchemy should do it via pyodbc instead.
Here is my working example; the df_to_sql_bulk_insert function refers to this (and I am now using my system DSN):
import re

import pandas as pd
import pyodbc


def df_to_sql_bulk_insert(df: pd.DataFrame, table: str, **kwargs) -> str:
    df = df.copy().assign(**kwargs)
    columns = ", ".join(df.columns)
    tuples = map(str, df.itertuples(index=False, name=None))
    values = re.sub(r"(?<=\W)(nan|None)(?=\W)", "NULL", (",\n" + " " * 7).join(tuples))
    return f"INSERT INTO {table} ({columns})\nVALUES {values}"


cnxn = pyodbc.connect("DSN=XXX")
cursor = cnxn.cursor()
sqlstr = df_to_sql_bulk_insert(df, "DBXXX.TBLXXX")
cursor.execute(sqlstr)
cnxn.commit()
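A possible follow-up on the design (my addition, not part of the original answer): df_to_sql_bulk_insert interpolates values straight into the SQL text, so quoting and injection are a concern. A hedged alternative sketch keeps pyodbc but uses parameter markers with executemany; the DSN, table name, and example DataFrame below are placeholders:

# Sketch: parameterized bulk insert with pyodbc; DSN and table name are placeholders.
import pandas as pd
import pyodbc

def bulk_insert_params(cnxn: pyodbc.Connection, df: pd.DataFrame, table: str) -> None:
    columns = ", ".join(df.columns)
    markers = ", ".join("?" for _ in df.columns)
    sql = f"INSERT INTO {table} ({columns}) VALUES ({markers})"
    # None/NaN values are sent as SQL NULL parameters instead of being
    # pasted into the statement text.
    data = [tuple(None if pd.isna(v) else v for v in row)
            for row in df.itertuples(index=False, name=None)]
    cursor = cnxn.cursor()
    cursor.fast_executemany = True  # drop this line if the IBM i driver rejects it
    cursor.executemany(sql, data)
    cnxn.commit()

df = pd.DataFrame({"COL1": [1, 2], "COL2": ["a", None]})  # stand-in for the real frame
cnxn = pyodbc.connect("DSN=XXX")
bulk_insert_params(cnxn, df, "DBXXX.TBLXXX")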
The data has HTML decimal entities inside the text:
col1
-------------------------------------------------------------
Drell-Yan Process Background Estimation Using e&#956; Method
Expressions of constant &#960;
Computational Analysis of Protein &#946;-Structure
&#948;13C and 14C Measurements in Aerosol Particles
I need to get actual symbols instead of all the HTML decimal values.
Html.Table in Power Query M can decode HTML decimal values into visible symbols:
let
Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText("PY7BCsIwDIZfJcyrg03nUHaa06MwEA8ydyg1arFrR5KCvr1l4C7JId/3/+m65EBobXpVDlryGplhr/T7ST64OxxZzKDEeAcXNu4JeAtZtioXu01ZwQnl5e9Jv+yS42ek6EaQwT9Axy3KCfzxMqsmrvHDGGRKVBbqOL5sJiW2Cxo3G0VZpWehoCUQTu582Vb5ugFDkBdNfEJxJAZ0whD9Gsmzt9AqEqMtctL3Pw==", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [col1 = _t]),
#"Changed Type" = Table.TransformColumnTypes(Source,{{"col1", type text}}),
#"Added Custom" = Table.AddColumn(#"Changed Type", "HtmlTable", each Html.Table([col1],{{"HtmlDecoded",":root"}})),
#"Expanded HtmlTable" = Table.ExpandTableColumn(#"Added Custom", "HtmlTable", {"HtmlDecoded"}, {"HtmlDecoded"})
in
#"Expanded HtmlTable"
Or in place:
let
Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText("PY7BCsIwDIZfJcyrg03nUHaa06MwEA8ydyg1arFrR5KCvr1l4C7JId/3/+m65EBobXpVDlryGplhr/T7ST64OxxZzKDEeAcXNu4JeAtZtioXu01ZwQnl5e9Jv+yS42ek6EaQwT9Axy3KCfzxMqsmrvHDGGRKVBbqOL5sJiW2Cxo3G0VZpWehoCUQTu582Vb5ugFDkBdNfEJxJAZ0whD9Gsmzt9AqEqMtctL3Pw==", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [col1 = _t]),
#"Changed Type" = Table.TransformColumnTypes(Source,{{"col1", type text}}),
Decoded = Table.TransformColumns(#"Changed Type", {{"col1", each Table.FirstValue(Html.Table(_,{{"HtmlDecoded",":root"}})) }} )
in
Decoded
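For comparison outside of Power Query (my addition, not from the answer), the Python standard library does the same decoding with html.unescape; the sample strings below assume the encoded form of the data shown above:

# Sketch: decode HTML decimal entities using only the standard library.
from html import unescape

samples = [
    "Drell-Yan Process Background Estimation Using e&#956; Method",
    "Expressions of constant &#960;",
    "Computational Analysis of Protein &#946;-Structure",
    "&#948;13C and 14C Measurements in Aerosol Particles",
]
for s in samples:
    print(unescape(s))  # e.g. "Expressions of constant π"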
I'm using the R package monitoR and getting an error message that I can't figure out.
I'm trying to upload a correlation template list ("bithTemps") to a MySQL database ("noh") using the dbUploadTemplate command.
dbUploadTemplate(templates = bithTemps,
uid = "root",
pwd = "****",
db.name = "noh",
analyst = 1,
locationID = "2",
date.recorded = "2017/09/07",
recording.equip = "Unknown",
species.code = "BITH",
type = "COR")
This returns:
Error: $ operator is invalid for atomic vectors
I have confirmed that the ODBC connection is working, that the template list is functional (i.e., it works when passed to other functions in the package), and that the SQL database has the required entries for analyst, location, and species code.
It seems that this error was actually triggered by a non-functional ODBC connection. This part of the dbUploadTemplate function
species <- RODBC::sqlQuery(dbCon, paste("SELECT `pkSpeciesID`, `fldSpeciesCode` FROM `tblSpecies` WHERE `fldSpeciesCode` = '",
paste(species.code, sep = "", collapse = "' OR `fldSpeciesCode` = '"),
"'", sep = ""))
queries a table in the SQL database and returns an object called 'species'. If the query fails (e.g., because RODBC can't connect) then 'species' is empty, and the following operation
speciesID <- NULL
for (i in 1:length(species.code)) {
speciesID[i] <- species$pkSpeciesID[species$fldSpeciesCode ==
species.code[i]]
}
triggers the error. Fixing the ODBC connection resolves the error.
I'm trying to merge several columns in a table with columns of another table. Each column in the primary table contains text, while the PrimaryAnalysis table contains indexes for the texts. I'd like to create index columns for the primary table, but I'm having to do that one column at a time, like this:
#"Merged Queries" = Table.NestedJoin(#"Changed Type2",{"Text.1"},PrimaryAnalysis,{"Letter"},"NewColumn"),
#"Expanded NewColumn" = Table.ExpandTableColumn(#"Merged Queries", "NewColumn", {"Index"}, {"Index"}),
#"Renamed Columns2" = Table.RenameColumns(#"Expanded NewColumn",{{"Index", "First"}}),
#"Merged Queries1" = Table.NestedJoin(#"Renamed Columns2",{"Text.2"},PrimaryAnalysis,{"Letter"},"NewColumn"),
#"Expanded NewColumn1" = Table.ExpandTableColumn(#"Merged Queries1", "NewColumn", {"Index"}, {"Index"}),
#"Renamed Columns3" = Table.RenameColumns(#"Expanded NewColumn1",{{"Index", "2nd"}}),
#"Merged Queries2" = Table.NestedJoin(#"Renamed Columns3",{"Text.3"},PrimaryAnalysis,{"Letter"},"NewColumn"),
#"Expanded NewColumn2" = Table.ExpandTableColumn(#"Merged Queries2", "NewColumn", {"Index"}, {"Index"}),
#"Renamed Columns4" = Table.RenameColumns(#"Expanded NewColumn2",{{"Index", "3rd"}}),
Now I have to do that for 23 columns. Is there a way to implement DO...Repeat or any other loop in Power Query to perform this task?
Thanks in advance.
One way to loop in Power Query is to use a recursive function.
In the code below I read an Excel file with a table that should be similar to your primary table (so the step #"Changed Type2" in the code below should be similar to your step #"Changed Type2").
Next, a function AddIndices is defined, in which one Index column is added in each iteration. After 23 iterations the function stops; otherwise it calls itself.
An important point of attention with such recursive functions is that they MUST include a Table.Buffer (see the step "Expanded"); otherwise, in each iteration the code tries to evaluate all former iterations again and gets stuck. Table.Buffer prevents this.
In the last step of the query, the function is invoked.
let
Source = Excel.Workbook(File.Contents("C:\Users\Marcel\Documents\Forum bijdragen\StackOverflow Power Query\Loop Computation in Power Query.xlsx"), null, true),
Tabel1_Table = Source{[Item="Tabel1",Kind="Table"]}[Data],
#"Changed Type2" = Table.TransformColumnTypes(Tabel1_Table,{{"Text.1", type text}, {"Text.2", type text}, {"Text.3", type text}, {"Text.4", type text}, {"Text.5", type text}, {"Text.6", type text}, {"Text.7", type text}, {"Text.8", type text}, {"Text.9", type text}, {"Text.10", type text}, {"Text.11", type text}, {"Text.12", type text}, {"Text.13", type text}, {"Text.14", type text}, {"Text.15", type text}, {"Text.16", type text}, {"Text.17", type text}, {"Text.18", type text}, {"Text.19", type text}, {"Text.20", type text}, {"Text.21", type text}, {"Text.22", type text}, {"Text.23", type text}}),
// Recursive function:
AddIndices = (TableSoFar as table, optional Iteration as number) as table =>
let
CurrentIteration = if Iteration = null then 1 else Iteration,
CurrentColumn = "Text."&Text.From(CurrentIteration),
NewIndexColumn = "Index."&Text.From(CurrentIteration),
MergedTable = Table.NestedJoin(TableSoFar,{CurrentColumn},PrimaryAnalysis,{"Letter"},"NewColumn"),
Expanded = Table.Buffer(Table.ExpandTableColumn(MergedTable, "NewColumn", {"Index"}, {NewIndexColumn})),
Result = if CurrentIteration = 23 then Expanded else @AddIndices(Expanded, CurrentIteration + 1)
in
Result,
// Call recursive function:
AddedIndices = AddIndices(#"Changed Type2")
in
AddedIndices
I am running an R script that adds data to a MySQL database. I usually format the data and add it as it comes in every couple of hours (the data stream is not continuous). My first set of data was added properly to the MySQL database. The second string of data cannot be added properly.
con = dbConnect(MySQL(), user='root', password='xxxxxx', dbname='test', host='localhost')
dbWriteTable(con, 'Tables', value = parseTweets(filterStream(file.name= "", track=c("lebron"), timeout=10, oauth=my_oauth)))
When I rerun the last code (dbWriteTable) again, it gives me the following error:
Error: Error in .local(conn, statement, ...) :
could not run statement: Table 'tables' already exists
I also used
dbWriteTable(con, 'Tables', value = parseTweets(filterStream(file.name= "", track=c("lebron"), timeout=10, oauth=my_oauth)), append = TRUE)
but it provides the same error
For some reason, setting append to TRUE is not working. Instead, give it a number. See the code below for better understanding:
dbWriteTable(con, 'Tables', value = parseTweets(filterStream(file.name= "", track=c("lebron"), timeout=100, oauth=my_oauth)), overwrite = 0, row.names = 0, append = 1)