Loading Data into a Project
Loading a Data Set into a Project
To load data into a project, you need three things: a blueprint, a project, and the data.
# encoding: utf-8
require 'gooddata'
GoodData.with_connection do |client|
  blueprint = GoodData::Model::ProjectBlueprint.build('Acme project') do |p|
    p.add_date_dimension('committed_on')
    p.add_dataset('dataset.commits') do |d|
      d.add_anchor('attr.commits.id')
      d.add_fact('fact.commits.lines_changed')
      d.add_attribute('attr.commits.name')
      d.add_label('label.commits.name', reference: 'attr.commits.name')
      d.add_date('committed_on', :format => 'dd/MM/yyyy')
    end
  end

  project = client.create_project_from_blueprint(blueprint, auth_token: 'TOKEN')

  # By default, the column names are the identifiers of the labels and facts,
  # or the names of the references
  data = [
    ['fact.commits.lines_changed', 'label.commits.name', 'committed_on'],
    [1, 'tomas', '01/01/2001'],
    [1, 'petr', '01/12/2001'],
    [1, 'jirka', '24/12/2014']]
  project.upload(data, blueprint, 'dataset.commits')
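  # Illustrative sketch only (assumes a hypothetical local file 'commits.csv'
  # with the same header row as above): upload should also accept a path to
  # a CSV file instead of an in-memory array
  # project.upload('commits.csv', blueprint, 'dataset.commits')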
  # If the column names in your data do not match these defaults, you can
  # supply the desired mapping in the :column_mapping parameter
  data = [
    ['lines', 'committer', 'date'],
    [1, 'tomas', '01/01/2001'],
    [1, 'petr', '01/12/2001'],
    [1, 'jirka', '24/12/2014']]
  column_mapping = {
    "fact.commits.lines_changed": 'lines',
    "label.commits.name": 'committer',
    "committed_on": 'date'
  }
  project.upload(data, blueprint, 'dataset.commits', column_mapping: column_mapping)
  # Now that the data is loaded, you can compute a simple number
  project.facts.first.create_metric(type: :sum).execute # => 3

  # By default, data is loaded in full mode. This means the new data
  # replaces all previous data in the dataset
  data = [
    ['fact.commits.lines_changed', 'label.commits.name', 'committed_on'],
    [10, 'tomas', '01/01/2001'],
    [10, 'petr', '01/12/2001'],
    [10, 'jirka', '24/12/2014']]
  project.upload(data, blueprint, 'dataset.commits')
  project.facts.first.create_metric(type: :sum).execute # => 30

  # You can also append more data using INCREMENTAL mode
  project.upload(data, blueprint, 'dataset.commits', :mode => 'INCREMENTAL')
  project.facts.first.create_metric(type: :sum).execute # => 60
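  # For symmetry, a small sketch (an assumption for illustration: 'FULL' is
  # accepted the same way as 'INCREMENTAL' above). Passing the mode
  # explicitly behaves like the default, so the same data replaces
  # everything again
  project.upload(data, blueprint, 'dataset.commits', :mode => 'FULL')
  project.facts.first.create_metric(type: :sum).execute # => 30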
  # If you want, you can also specify what the column names in the CSV
  # are going to be
  blueprint = GoodData::Model::ProjectBlueprint.build('Acme project') do |p|
    p.add_date_dimension('committed_on')
    p.add_dataset('dataset.commits') do |d|
      d.add_anchor('attr.commits.id')
      d.add_fact('fact.commits.lines_changed', column_name: 'fact')
      d.add_attribute('attr.commits.name')
      d.add_label('label.commits.name', reference: 'attr.commits.name', column_name: 'label')
      d.add_date('committed_on', :format => 'dd/MM/yyyy', column_name: 'ref')
    end
  end

  data = [
    ['fact', 'label', 'ref'],
    [10, 'tomas', '01/01/2001'],
    [10, 'petr', '01/12/2001'],
    [10, 'jirka', '24/12/2014']]
  project.upload(data, blueprint, 'dataset.commits')
end
Loading Multiple Data Sets into a Project
You can load multiple data sets into a project at once.
The GoodData platform supports loading multiple datasets from a set of CSV files in a single task. Instead of loading one CSV at a time, you upload your CSV files, provide a JSON manifest file, and execute the whole data load through a single API call. This method is particularly useful when your project contains many datasets, or when you are loading multiple datasets with larger data volumes; processing multiple datasets this way is significantly faster in those situations.
For more info, see the GoodData article.
# encoding: utf-8
require 'gooddata'
require 'csv'
require 'json'
USERNAME = 'YOUR_USERNAME'
PASSWORD = 'YOUR_PASSWORD'
TOKEN = 'YOUR_TOKEN'
GoodData.with_connection(USERNAME, PASSWORD) do |client|
  # Create the LDM blueprint
  blueprint = GoodData::Model::ProjectBlueprint.from_json('data/hr_manifest.json')

  # Create a new project (datamart)
  project = GoodData::Project.create_from_blueprint(blueprint, auth_token: TOKEN)
  puts "Created project #{project.pid}"

  data = [
    {
      data: 'data/hr_departments.csv',
      dataset: 'dataset.department'
    },
    {
      data: 'data/hr_employees.csv',
      dataset: 'dataset.employee'
    },
    {
      data: 'data/hr_salaries.csv',
      dataset: 'dataset.salary',
      options: { :mode => 'INCREMENTAL' }
    }
  ]
  res = project.upload_multiple(data, blueprint)
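  # A quick sanity check sketch (an assumption for illustration:
  # project.datasets returns the dataset metadata objects, each
  # exposing its identifier)
  puts project.datasets.map(&:identifier)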
  puts JSON.pretty_generate(res)
  puts 'Done!'
end
Loading Data with a Specific Date Format
You can specify a date format for loading in your blueprint. If you do not specify a format, the default MM/dd/yyyy format is used.
# encoding: utf-8
require 'gooddata'
GoodData.with_connection do |client|
  blueprint = GoodData::Model::ProjectBlueprint.build('Acme project') do |p|
    p.add_date_dimension('committed_on')
    p.add_dataset('dataset.commits') do |d|
      d.add_anchor('attr.commits.id')
      d.add_fact('fact.commits.lines_changed')
      d.add_attribute('attr.commits.name')
      d.add_label('label.commits.name', reference: 'attr.commits.name')
      d.add_date('committed_on', :dataset => 'committed_on')
    end
  end

  project = client.create_project_from_blueprint(blueprint, auth_token: 'token')

  # By default the dates are expected in the MM/dd/yyyy format
  data = [
    ['fact.commits.lines_changed', 'label.commits.name', 'committed_on'],
    [1, 'tomas', '01/01/2001'],
    [1, 'petr', '12/01/2001'],
    [1, 'jirka', '12/24/2014']]
  project.upload(data, blueprint, 'dataset.commits')
  puts project.compute_report(top: [project.facts.first.create_metric], left: ['committed_on.date'])
  # prints
  #
  # [01/01/2001 | 1.0]
  # [12/01/2001 | 1.0]
  # [12/24/2014 | 1.0]
  # If you try to load dates in a different format, the load will fail
  data = [['fact.commits.lines_changed', 'label.commits.name', 'committed_on'],
          [1, 'tomas', '2001-01-01']]
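  # A hedged sketch of running the failing load without aborting the script
  # (an assumption for illustration: the failure surfaces as a raised Ruby
  # error, and the exact error class may vary between gem versions)
  begin
    project.upload(data, blueprint, 'dataset.commits')
  rescue StandardError => e
    puts "Load failed: #{e.message}"
  end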
  # You can specify a different date format
  blueprint = GoodData::Model::ProjectBlueprint.build('Acme project') do |p|
    p.add_date_dimension('committed_on')
    p.add_dataset('dataset.commits') do |d|
      d.add_anchor('attr.commits.id')
      d.add_fact('fact.commits.lines_changed')
      d.add_attribute('attr.commits.name')
      d.add_label('label.commits.name', reference: 'attr.commits.name')
      d.add_date('committed_on', :dataset => 'committed_on', format: 'yyyy-dd-MM')
    end
  end

  data = [
    ['fact.commits.lines_changed', 'label.commits.name', 'committed_on'],
    [3, 'tomas', '2001-01-01'],
    [3, 'petr', '2001-01-12'],
    [3, 'jirka', '2014-24-12']]
  project.upload(data, blueprint, 'dataset.commits')

  puts project.compute_report(top: [project.facts.first.create_metric], left: ['committed_on.date'])
  # prints
  #
  # [01/01/2001 | 3.0]
  # [12/01/2001 | 3.0]
  # [12/24/2014 | 3.0]
end
Note a couple of things. We did not have to update the project to be able to load dates in a different format; the date format information is used only during the data upload, and the model itself is unaffected. Keep this in mind when you are inferring the blueprint from the model using project.blueprint, because this information is not persisted in the project.
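A minimal sketch of that consequence (assuming the project and data from the example above are still around): a blueprint rebuilt with project.blueprint expects the default MM/dd/yyyy format again, so you have to re-specify the format every time you rebuild it.
# Illustrative sketch only: the inferred blueprint has no memory of the
# yyyy-dd-MM format used above, so loading the same data through it
# would fail with a date parsing error
inferred = project.blueprint
project.upload(data, inferred, 'dataset.commits') # expects MM/dd/yyyy again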