Skip to content

Module extract_load.brocolib_extract_load.ingest

View Source
import pandas as pd

import requests as rq

def extract(url, source_type, params={}, nested_key=None):
  '''
  Fetch a remote data source over HTTP and load it into a DataFrame.

    - read json from url
    - optionally select one entry of the decoded json
    - convert json to dataframe

  Parameters:
    url (str): url of the data source
    source_type (str): type of the data to fetch; compared
      case-insensitively, only 'json' is implemented
    params (dict) : request parameters
    nested_key : optional key used to index the decoded json before the
      dataframe is built; skipped when falsy (None, '', 0)

  Returns:
     (pandas.DataFrame): Dataframe created from source

  Exceptions:
      NotImplementedError: if the source type is not implemented
      requests.HTTPError: if the server answers with a 4xx/5xx status
  '''
  # Reject unsupported source types before any network traffic happens.
  if source_type.lower() != 'json':
    raise NotImplementedError("sources available: json")

  response = rq.get(url, params=params)
  # Fail on HTTP errors rather than building a DataFrame from an error payload.
  response.raise_for_status()

  data = response.json()
  if nested_key:
    data = data[nested_key]
  return pd.DataFrame(data)

Functions

extract

def extract(
    url,
    source_type,
    params={},
    nested_key=None
)

Function to extract data

  • read json from url
  • convert json to dataframe

Parameters:

Name Type Description Default
url str url of the data source None
source_type str type of the data to fetch None
params dict request parameters {}

Returns:

Type Description
(pandas.DataFrame) Dataframe created from source

Raises:

Type Description
NotImplementedError if the source type is not implemented
View Source
def extract(url, source_type, params={}, nested_key=None):
  '''
  Fetch a remote data source over HTTP and load it into a DataFrame.

    - read json from url
    - optionally select one entry of the decoded json
    - convert json to dataframe

  Parameters:
    url (str): url of the data source
    source_type (str): type of the data to fetch; compared
      case-insensitively, only 'json' is implemented
    params (dict) : request parameters
    nested_key : optional key used to index the decoded json before the
      dataframe is built; skipped when falsy (None, '', 0)

  Returns:
     (pandas.DataFrame): Dataframe created from source

  Exceptions:
      NotImplementedError: if the source type is not implemented
      requests.HTTPError: if the server answers with a 4xx/5xx status
  '''
  # Reject unsupported source types before any network traffic happens.
  if source_type.lower() != 'json':
    raise NotImplementedError("sources available: json")

  response = rq.get(url, params=params)
  # Fail on HTTP errors rather than building a DataFrame from an error payload.
  response.raise_for_status()

  data = response.json()
  if nested_key:
    data = data[nested_key]
  return pd.DataFrame(data)