Create single row dataframe from list of list PySpark
I have a data like this data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
I want to create a PySpark dataframe
I already use
dataframe = SQLContext.createDataFrame(data, ['features'])
but I always get
+--------+---+
|features| _2|
+--------+---+
| 1.1|1.2|
| 1.3|1.4|
| 1.5|1.6|
+--------+---+
how can I get result like below?
+----------+
|features |
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
python apache-spark pyspark apache-spark-sql
add a comment |
I have a data like this data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
I want to create a PySpark dataframe
I already use
dataframe = SQLContext.createDataFrame(data, ['features'])
but I always get
+--------+---+
|features| _2|
+--------+---+
| 1.1|1.2|
| 1.3|1.4|
| 1.5|1.6|
+--------+---+
how can I get result like below?
+----------+
|features |
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
python apache-spark pyspark apache-spark-sql
You can create a schema and provide it when creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11
add a comment |
I have a data like this data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
I want to create a PySpark dataframe
I already use
dataframe = SQLContext.createDataFrame(data, ['features'])
but I always get
+--------+---+
|features| _2|
+--------+---+
| 1.1|1.2|
| 1.3|1.4|
| 1.5|1.6|
+--------+---+
how can I get result like below?
+----------+
|features |
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
python apache-spark pyspark apache-spark-sql
I have a data like this data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
I want to create a PySpark dataframe
I already use
dataframe = SQLContext.createDataFrame(data, ['features'])
but I always get
+--------+---+
|features| _2|
+--------+---+
| 1.1|1.2|
| 1.3|1.4|
| 1.5|1.6|
+--------+---+
how can I get result like below?
+----------+
|features |
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
python apache-spark pyspark apache-spark-sql
python apache-spark pyspark apache-spark-sql
edited Jan 14 at 12:54
user6910411
33.2k97398
33.2k97398
asked Feb 12 '18 at 11:08
Yanfa Adi PutraYanfa Adi Putra
629
629
You can create a schema and provide it when creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11
add a comment |
You can create a schema and provide it when creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11
You can create a schema and provide it when creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11
You can create a schema and provide it when creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11
add a comment |
3 Answers
3
active
oldest
votes
I find it's useful to think of the argument to createDataFrame()
as a list of tuples where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column.
You can get your desired output by making each element in the list a tuple:
data = [([1.1, 1.2],), ([1.3, 1.4],), ([1.5, 1.6],)]
dataframe = sqlCtx.createDataFrame(data, ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
Or if changing the source is cumbersome, you can equivalently do:
data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
dataframe = sqlCtx.createDataFrame(map(lambda x: (x, ), data), ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
add a comment |
You need a map function that wraps each inner list in an outer list, so that each one becomes a single-column row, and then pass the result to createDataFrame:
dataframe = sqlContext.createDataFrame(sc.parallelize(data).map(lambda x: [x]), ['features'])
You should get as you desire
+----------+
| features|
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
add a comment |
You should use VectorAssembler. From your code, I guess you are doing this to train a machine learning model, and VectorAssembler works best for that case. You can also add the assembler to a pipeline:
# VectorAssembler takes its list of input columns through `inputCols` (plural).
# NOTE(review): `data` is read via `.columns` and passed to fit/transform, so it
# must already be a DataFrame here, not the raw list of lists — confirm.
assemble_feature=VectorAssembler(inputCols=data.columns,outputCol='features')
pipeline=Pipeline(stages=[assemble_feature])
pipeline.fit(data).transform(data)
add a comment |
Your Answer
// Nested loader calls: inside the ifUsing("editor", ...) callback, request
// "externalEditor", then "snippets", and call StackExchange.snippets.init()
// in the innermost callback.
// NOTE(review): the meaning of the third argument "code-snippets" is not
// visible here — confirm against the StackExchange loader.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
// Registered with StackExchange.ready: sets up tag rendering, then creates the
// answer editor once the "externalEditor" module has been requested.
StackExchange.ready(function() {
  // "".split(" ") evaluates to [""], so every tag list passed below is a
  // one-element array holding the empty string.
  var channelOptions = {
    tags: "".split(" "),
    id: "1"
  };
  initTagRenderer("".split(" "), "".split(" "), channelOptions);

  StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
      StackExchange.using("snippets", function() {
        createEditor();
      });
    }
    else {
      createEditor();
    }
  });

  /**
   * Passes the answer-editor configuration to StackExchange.prepareEditor.
   *
   * Written as a function declaration so it is hoisted and can be called from
   * the callbacks registered above it.
   */
  function createEditor() {
    StackExchange.prepareEditor({
      heartbeatType: 'answer',
      autoActivateHeartbeat: false,
      convertImagesToLinks: true,
      noModals: true,
      showLowRepImageUploadWarning: true,
      reputationToPostImages: 10,
      bindNavPrevention: true,
      postfix: "",
      imageUploader: {
        // The HTML strings use \u003c / \u003e for "<" / ">" and escape their
        // inner double quotes; without the backslashes the literals do not parse.
        brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
        contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
        allowUrls: true
      },
      onDemand: true,
      discardSelector: ".discard-answer",
      immediatelyShowMarkdownHelp: true
    });
  }
});
Sign up or log in
// Registers a ready callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
// NOTE(review): presumably saves the draft answer when the login link is
// clicked — confirm against StackExchange.helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Registers a ready callback that calls openid.initPostLogin with the
// '.new-post-login' selector, the URL-encoded address of this question's
// #new-answer anchor, and the 'question_page' label.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f48745029%2fcreate-single-row-dataframe-from-list-of-list-pyspark%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
3 Answers
3
active
oldest
votes
3 Answers
3
active
oldest
votes
active
oldest
votes
active
oldest
votes
I find it's useful to think of the argument to createDataFrame()
as a list of tuples where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column.
You can get your desired output by making each element in the list a tuple:
data = [([1.1, 1.2],), ([1.3, 1.4],), ([1.5, 1.6],)]
dataframe = sqlCtx.createDataFrame(data, ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
Or if changing the source is cumbersome, you can equivalently do:
data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
dataframe = sqlCtx.createDataFrame(map(lambda x: (x, ), data), ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
add a comment |
I find it's useful to think of the argument to createDataFrame()
as a list of tuples where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column.
You can get your desired output by making each element in the list a tuple:
data = [([1.1, 1.2],), ([1.3, 1.4],), ([1.5, 1.6],)]
dataframe = sqlCtx.createDataFrame(data, ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
Or if changing the source is cumbersome, you can equivalently do:
data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
dataframe = sqlCtx.createDataFrame(map(lambda x: (x, ), data), ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
add a comment |
I find it's useful to think of the argument to createDataFrame()
as a list of tuples where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column.
You can get your desired output by making each element in the list a tuple:
data = [([1.1, 1.2],), ([1.3, 1.4],), ([1.5, 1.6],)]
dataframe = sqlCtx.createDataFrame(data, ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
Or if changing the source is cumbersome, you can equivalently do:
data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
dataframe = sqlCtx.createDataFrame(map(lambda x: (x, ), data), ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
I find it's useful to think of the argument to createDataFrame()
as a list of tuples where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column.
You can get your desired output by making each element in the list a tuple:
data = [([1.1, 1.2],), ([1.3, 1.4],), ([1.5, 1.6],)]
dataframe = sqlCtx.createDataFrame(data, ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
Or if changing the source is cumbersome, you can equivalently do:
data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
dataframe = sqlCtx.createDataFrame(map(lambda x: (x, ), data), ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
edited Feb 12 '18 at 17:04
answered Feb 12 '18 at 16:19
paultpault
14.8k32249
14.8k32249
add a comment |
add a comment |
You need a map function that wraps each inner list in an outer list, so that each one becomes a single-column row, and then pass the result to createDataFrame:
dataframe = sqlContext.createDataFrame(sc.parallelize(data).map(lambda x: [x]), ['features'])
You should get as you desire
+----------+
| features|
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
add a comment |
You need a map function that wraps each inner list in an outer list, so that each one becomes a single-column row, and then pass the result to createDataFrame:
dataframe = sqlContext.createDataFrame(sc.parallelize(data).map(lambda x: [x]), ['features'])
You should get as you desire
+----------+
| features|
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
add a comment |
You need a map function that wraps each inner list in an outer list, so that each one becomes a single-column row, and then pass the result to createDataFrame:
dataframe = sqlContext.createDataFrame(sc.parallelize(data).map(lambda x: [x]), ['features'])
You should get as you desire
+----------+
| features|
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
You need a map function that wraps each inner list in an outer list, so that each one becomes a single-column row, and then pass the result to createDataFrame:
dataframe = sqlContext.createDataFrame(sc.parallelize(data).map(lambda x: [x]), ['features'])
You should get as you desire
+----------+
| features|
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
answered Feb 12 '18 at 11:23
Ramesh MaharjanRamesh Maharjan
27.1k52046
27.1k52046
add a comment |
add a comment |
You should use VectorAssembler. From your code, I guess you are doing this to train a machine learning model, and VectorAssembler works best for that case. You can also add the assembler to a pipeline:
# VectorAssembler takes its list of input columns through `inputCols` (plural).
# NOTE(review): `data` is read via `.columns` and passed to fit/transform, so it
# must already be a DataFrame here, not the raw list of lists — confirm.
assemble_feature=VectorAssembler(inputCols=data.columns,outputCol='features')
pipeline=Pipeline(stages=[assemble_feature])
pipeline.fit(data).transform(data)
add a comment |
You should use VectorAssembler. From your code, I guess you are doing this to train a machine learning model, and VectorAssembler works best for that case. You can also add the assembler to a pipeline:
# VectorAssembler takes its list of input columns through `inputCols` (plural).
# NOTE(review): `data` is read via `.columns` and passed to fit/transform, so it
# must already be a DataFrame here, not the raw list of lists — confirm.
assemble_feature=VectorAssembler(inputCols=data.columns,outputCol='features')
pipeline=Pipeline(stages=[assemble_feature])
pipeline.fit(data).transform(data)
add a comment |
You should use VectorAssembler. From your code, I guess you are doing this to train a machine learning model, and VectorAssembler works best for that case. You can also add the assembler to a pipeline:
# VectorAssembler takes its list of input columns through `inputCols` (plural).
# NOTE(review): `data` is read via `.columns` and passed to fit/transform, so it
# must already be a DataFrame here, not the raw list of lists — confirm.
assemble_feature=VectorAssembler(inputCols=data.columns,outputCol='features')
pipeline=Pipeline(stages=[assemble_feature])
pipeline.fit(data).transform(data)
You should use VectorAssembler. From your code, I guess you are doing this to train a machine learning model, and VectorAssembler works best for that case. You can also add the assembler to a pipeline:
# VectorAssembler takes its list of input columns through `inputCols` (plural).
# NOTE(review): `data` is read via `.columns` and passed to fit/transform, so it
# must already be a DataFrame here, not the raw list of lists — confirm.
assemble_feature=VectorAssembler(inputCols=data.columns,outputCol='features')
pipeline=Pipeline(stages=[assemble_feature])
pipeline.fit(data).transform(data)
answered Feb 12 '18 at 12:04
pratiklodhapratiklodha
686718
686718
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
// Registers a ready callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
// NOTE(review): presumably saves the draft answer when the login link is
// clicked — confirm against StackExchange.helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Registers a ready callback that calls openid.initPostLogin with the
// '.new-post-login' selector, the URL-encoded address of this question's
// #new-answer anchor, and the 'question_page' label.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f48745029%2fcreate-single-row-dataframe-from-list-of-list-pyspark%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
// Registers a ready callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
// NOTE(review): presumably saves the draft answer when the login link is
// clicked — confirm against StackExchange.helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Registers a ready callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
// NOTE(review): presumably saves the draft answer when the login link is
// clicked — confirm against StackExchange.helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Registers a ready callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
// NOTE(review): presumably saves the draft answer when the login link is
// clicked — confirm against StackExchange.helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
You can create a schema and provide it when creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11