Insert SQL Server table rows into a BigQuery table in one block
I want to insert all rows of a SQL Server table into a BigQuery table that has the same schema.
Streaming the rows one by one is very slow: inserting 1000 rows with the code below took about 10 minutes.
In this code I loop over the first 10 files in a given folder and insert the content of each file into a single SQL Server table. Once I have looped over the desired files, I loop over that SQL Server table (which contains all rows of all the files) and insert its content, row by row, into a BigQuery table.
This is very slow. Does anyone have a better (faster) way to insert the content of a SQL Server table into a BigQuery table automatically (from code)?
<cfsilent>
<cfinclude template="app_locals.cfm" />
<cfinclude template="act_BigqueryApiAccess.cfm" />
</cfsilent>
<!--- First BQ step: insert the processed parcels --->
<!--- log the start of the first BQ step (TShipping) --->
<cfset BigqueryTShipping_StartDate=now()>
<cfset QueryName = "InsertBigqueryLogTShippingStartDate">
<cfinclude template="qry_item.cfm">
<cfdirectory action="list" directory="#FileRoot#_data_BigqueryTShipping" listinfo="all" type="file" name="FList" sort="datelastmodified">
<cfset FileList = Valuelist(FList.name)>
<cfoutput><h3>FileList: #FileList#</h3></cfoutput>
<cfif len(trim(FileList))>
<!--- process the last 10 files (the MaxNbFile least recent ones) --->
<cfset FileLoop = 1>
<cfloop list="#FileList#" index="FileName">
<cfset PathFile="#FileRoot#_data_BigqueryTShipping#FileName#">
<cfset QueryName = "InsertTShipping">
<cfinclude template="qry_item.cfm">
<cfset FileLoop = FileLoop+1>
<cfif FileLoop GT Attributes.MaxNbFile>
<cfbreak />
</cfif>
</cfloop>
</cfif>
<!--- instantiate an object of type (class) TableRow --->
<cfobject action="create" type="java" class="com.google.api.services.bigquery.model.TableRow" name="row">
<!--- <cfdump var="#row#"> --->
<cfset QueryName = "GetParcels">
<cfinclude template="qry_item.cfm">
<cfloop query="GetParcels">
<cfset row.set("Tracking_Date",mid(Tracking_Date,6,19))>
<cfset row.set("TShipping_ID", TShipping_ID)>
<cfset row.set("TShipping_Tracking", TShipping_Tracking)>
<cfset row.set("Shipper_ID", Shipper_ID)>
<cfset rows.setInsertId(sys.currentTimeMillis())>
<cfset rows.setJson(row)>
<cfset rowList.add(rows)>
<cfset content=rqst.setRows(rowList)>
<cfset response = bq.tabledata().insertAll(Project_ID,Dataset_ID,Table_ID, content).execute()>
</cfloop>
<!--- empty the TShipping_BQ table --->
<cfset QueryName = "DeleteOldTShipping_BQParcels">
<cfinclude template="qry_item.cfm">
<!--- deletion of the processed files --->
<cfif len(trim(FileList))>
<cfset TShippingFileNb=len(trim(FileList))>
<cfset FileLoop = 1>
<cfloop list="#FileList#" index="FileName">
<cfset PathFile="#FileRoot#_data_BigqueryTShipping#FileName#">
<cffile action="move" source="#PathFile#" destination="#FileRoot#_data_BigqueryTShippingArchive">
<!--- <cffile action="delete" file="#PathFile#"> --->
<cfset FileLoop = FileLoop+1>
<cfif FileLoop GT Attributes.MaxNbFile>
<cfbreak />
</cfif>
</cfloop>
<cfelse>
<cfset TShippingFileNb=0>
</cfif>
<!--- log the number of TShipping files processed --->
<cfset QueryName = "InsertBigqueryLogTShippingNb">
<cfinclude template="qry_item.cfm">
<!--- log the end of the first BQ step --->
<cfset BigqueryTShipping_EndDate=now()>
<cfset QueryName = "InsertBigqueryLogTShippingEndDate">
<cfinclude template="qry_item.cfm">
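For reference, here is a minimal sketch (untested, for illustration only) of what the same streaming calls could look like when sent as a single batch. It assumes, as the code above implies, that bq, rqst (a TableDataInsertAllRequest), Project_ID, Dataset_ID and Table_ID are created in act_BigqueryApiAccess.cfm. It builds the whole row list first and makes one insertAll call after the loop instead of one per row; note that a single insertAll request is size-limited, so a very large table would still have to be sent in chunks.
<cfset rowList = createObject("java", "java.util.ArrayList").init()>
<cfloop query="GetParcels">
    <!--- Create a NEW TableRow and Rows wrapper for every record; reusing one object
          would make every list entry point to the same row. --->
    <cfset row = createObject("java", "com.google.api.services.bigquery.model.TableRow").init()>
    <cfset row.set("Tracking_Date", mid(Tracking_Date, 6, 19))>
    <cfset row.set("TShipping_ID", TShipping_ID)>
    <cfset row.set("TShipping_Tracking", TShipping_Tracking)>
    <cfset row.set("Shipper_ID", Shipper_ID)>
    <cfset rows = createObject("java", "com.google.api.services.bigquery.model.TableDataInsertAllRequest$Rows").init()>
    <!--- Use a per-row value (e.g., the primary key) as insertId so BigQuery can de-duplicate retries. --->
    <cfset rows.setInsertId(toString(TShipping_ID))>
    <cfset rows.setJson(row)>
    <cfset rowList.add(rows)>
</cfloop>
<!--- One HTTP request for the whole batch instead of one per row. --->
<cfif rowList.size() GT 0>
    <cfset content = rqst.setRows(rowList)>
    <cfset response = bq.tabledata().insertAll(Project_ID, Dataset_ID, Table_ID, content).execute()>
</cfif>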
Tags: sql-server, coldfusion, cfml, google-bigquery
asked May 27 '14 at 14:37 by user3569267; edited by 200_success
1 Answer
Wrong Approach
I think what you are trying to do makes sense; however, your row-by-row approach is what makes this so slow. Anything you do row by row in SQL (e.g., a cursor or a loop) will be slow, because the whole query is executed anew for each row. SQL is optimized to work on large data sets, not on single rows.
Suggested Approach
I feel sure that you could establish a connection to the BigQuery server directly and pass it sets of data, via sp_addlinkedserver or a similar approach. Contact the people at BigQuery for help with this.
Do keep scanning through your files to insert the data into your local SQL Server table. Then you could just do something like:
INSERT INTO [BigQueryServer].[database].[schema].[table]
SELECT * FROM [LocalServer].[database].[schema].[table]
WHERE [LocalServer].[database].[schema].[table].[added_timestamp] -- or whatever column you use to keep track of records added
>= '2014-07-01' -- or whatever date
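For illustration only, the linked-server registration might look roughly like the following. It assumes a third-party BigQuery ODBC driver is installed on the SQL Server machine and configured as a system DSN; BigQueryServer and BigQueryDSN are placeholder names, and both the driver choice and the INSERT performance over such a link would need to be verified.
-- Placeholder names: adjust @server and @datasrc to your environment.
EXEC master.dbo.sp_addlinkedserver
    @server     = N'BigQueryServer',   -- name used in the four-part INSERT above
    @srvproduct = N'',                 -- empty when going through MSDASQL
    @provider   = N'MSDASQL',          -- Microsoft OLE DB Provider for ODBC Drivers
    @datasrc    = N'BigQueryDSN';      -- ODBC system DSN that points at BigQuery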
answered Jul 23 '14 at 21:07 by Phrancis