Commit ad871af

Default return type based on backend-type (#110)
- Allow for a backend type, used to look things up in the `.servicex` file.
1 parent aaba3a9 · commit ad871af

13 files changed: +444 -75 lines
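
For orientation, a minimal sketch of what the change means for a caller, assuming the frontend exposes `ServiceX` at the package top level and accepts `backend_type` as a constructor argument (as the `__init__` body below suggests); the dataset name and selection string are placeholders, and a `type: uproot` endpoint is assumed to be configured in `.servicex`:

```python
import asyncio
from servicex import ServiceX  # assumed top-level export

async def main():
    # Hypothetical dataset; backend_type picks the matching .servicex entry.
    ds = ServiceX('mc15_13TeV:some.rucio.dataset', backend_type='uproot')

    # An uproot backend returns parquet files; with this change the awkward
    # conversion defaults to parquet instead of assuming ROOT.
    data = await ds.get_data_awkward_async('<qastle selection string>')
    print(list(data.keys()))

asyncio.run(main())
```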

.vscode/settings.json

Lines changed: 3 additions & 4 deletions
@@ -25,6 +25,7 @@
         "aexit",
         "aiohttp",
         "asyncio",
+        "backends",
         "cacheme",
         "codecov",
         "dcache",
@@ -54,6 +55,7 @@
         "prereleased",
         "protomolecule",
         "ptetaphi",
+        "pyarrow",
         "pypa",
         "pypi",
         "pytest",
@@ -75,8 +77,5 @@
         "xaod",
         "xrootd"
     ],
-    "python.analysis.typeCheckingMode": "basic",
-    "python.testing.pytestArgs": [
-        "--no-cov"
-    ]
+    "python.analysis.typeCheckingMode": "basic"
 }

README.md

Lines changed: 2 additions & 1 deletion
@@ -39,7 +39,7 @@ Create a `.servicex` file, in the `yaml` format, in the appropriate place for yo

 ```yaml
 api_endpoints:
-  - endpoint: <your-endpoint>
+  - endpoint: <your-endpoint-url>
     email: <api-email>
     password: <api-password>
     type: xaod
@@ -105,6 +105,7 @@ The file can contain an `api_endpoint` as mentioned above. In addition the other
 - Linux: `cache_path: "/home/servicex-cache"`

 - `minio_endpoint`, `minio_username`, `minio_password` - these are only interesting if you are using a pre-RC2 release of `servicex` - when the `minio` information wasn't part of the API exchange. This feature is deprecated and will be removed around the time `servicex` moves to RC3.
+- `backend_types` - a list of yaml dictionaries holding defaults for each backend type. Out of the box only `return_data` is set: `root` for `xaod` backends and `parquet` for `uproot` backends. This is what lets `servicex` convert the returned files to a `pandas.DataFrame` or an `awkward` array when the user asks for them.

 All strings are expanded using python's [os.path.expand](https://docs.python.org/3/library/os.path.html#os.path.expandvars) method - so `$NAME` and `${NAME}` will work to expand existing environment variables.
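
A purely illustrative sketch of the lookup the new `backend_types` option enables. It mirrors the defaults documented above once parsed from yaml; it is not the actual `ServiceXConfigAdaptor` code, which lives in a file not shown in this excerpt:

```python
from typing import Optional

# Parsed form of the backend_types / default_return_data settings described above.
settings = {
    'default_return_data': 'parquet',
    'backend_types': [
        {'type': 'xaod', 'return_data': 'root'},
        {'type': 'uproot', 'return_data': 'parquet'},
    ],
}


def default_return_data(backend_type: Optional[str]) -> str:
    '''Return the file format a backend hands back, falling back to the global default.'''
    for entry in settings['backend_types']:
        if entry['type'] == backend_type:
            return entry['return_data']
    return settings['default_return_data']


assert default_return_data('xaod') == 'root'
assert default_return_data('uproot') == 'parquet'
assert default_return_data(None) == 'parquet'  # unknown backend -> default_return_data
```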

servicex/config_default.yaml

Lines changed: 13 additions & 0 deletions
@@ -23,3 +23,16 @@ cache_path: /tmp/servicex
 # This is a dummy value, here only to make sure that unit testing
 # works properly before package release.
 testing_value: 10
+
+# If we can't figure out what backend the user is going to use, we
+# return this sort of file. Parquet for the uproot backend, and root for the
+# xaod backend.
+default_return_data: parquet
+
+# Defaults for the various types of servicex backends that we might deal with.
+# Easy enough to add a new one here...
+backend_types:
+  - type: xaod
+    return_data: root
+  - type: uproot
+    return_data: parquet
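
How these defaults reach the converter, sketched from the calls visible in the `servicex.py` changes below. `ServiceXConfigAdaptor` itself is defined in `servicex/servicex_config.py`, which is not part of this excerpt, so treat the exact lookup behaviour as an assumption:

```python
from servicex.servicex_config import ServiceXConfigAdaptor
from servicex.data_conversions import DataConverterAdaptor

config = ServiceXConfigAdaptor()                             # reads .servicex plus these defaults
file_type = config.get_default_returned_datatype('uproot')  # expected to be 'parquet' per backend_types
converter = DataConverterAdaptor(file_type)                  # conversions now assume parquet input
```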

servicex/data_conversions.py

Lines changed: 142 additions & 44 deletions
The module-level helpers `_convert_root_to_pandas` and `_convert_root_to_awkward` are removed and their bodies become methods on a new `DataConverterAdaptor` class, which also gains parquet converters and a configurable default file type; the conversion thread pool grows from 2 to 4 workers. The file now reads:

from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import asyncio
from servicex.utils import ServiceXException
from typing import Dict, Optional, Union

from awkward.array.chunked import ChunkedArray

_conversion_pool = ThreadPoolExecutor(4)


class DataConverterAdaptor:
    '''Methods to convert from one type of data to the other.
    '''
    def __init__(self, default_file_type: str):
        '''Create a data converter adaptor. By default it will do the
        conversion as requested.

        Args:
            default_file_type (str): The default file type (`parquet` or `root`)
        '''
        self._default_file_type = default_file_type

    async def convert_to_pandas(self, file: Path, file_type: Optional[str] = None):
        '''Convert to a pandas dataframe from data stored in a file of a particular file_type

        Args:
            file (Path): Path to the file
            file_type (str): What the file contains (root, parquet, etc)
        '''
        file_type = file_type if file_type is not None else self._default_file_type
        if file_type == 'root':
            return await self._convert_root_to_pandas(file)
        elif file_type == 'parquet':
            return await self._convert_parquet_to_pandas(file)
        else:
            raise ServiceXException(f'Conversion from {file_type} into a pandas DF is not '
                                    'yet supported')

    async def convert_to_awkward(self, file: Path, file_type: Optional[str] = None):
        '''Convert to an awkward data array from data stored in a file of a particular file_type

        Args:
            file (Path): Path to the file
            file_type (str): What the file contains (root, parquet, etc)
        '''
        file_type = file_type if file_type is not None else self._default_file_type
        if file_type == 'root':
            return await self._convert_root_to_awkward(file)
        elif file_type == 'parquet':
            return await self._convert_parquet_to_awkward(file)
        else:
            raise ServiceXException(f'Conversion from {file_type} into an awkward array is not '
                                    'yet supported')

    async def _convert_root_to_pandas(self, file: Path):
        '''
        Convert the contents of a ROOT file to pandas.

        Arguments:

            file        A `Path` to the file containing the data

        Returns:

            DataFrame   A pandas dataframe

        Note:

            - Work is done on a second thread.
            - Pandas is only imported if this is called.

        '''
        from pandas import DataFrame

        def do_the_work(file: Path) -> DataFrame:
            import uproot

            f_in = uproot.open(file)
            try:
                r = f_in[f_in.keys()[0]]
                return r.pandas.df()  # type: ignore
            finally:
                f_in._context.source.close()

        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))

    async def _convert_parquet_to_pandas(self, file: Path):
        '''
        Convert the contents of a parquet file to pandas.

        Arguments:

            file        A `Path` to the file containing the data

        Returns:

            DataFrame   A pandas dataframe

        Note:

            - Work is done on a second thread.
            - Pandas is only imported if this is called.

        '''
        import pandas as pd

        def do_the_work(file: Path) -> pd.DataFrame:
            return pd.read_parquet(str(file))

        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))

    async def _convert_root_to_awkward(self, file: Path):
        '''
        Convert the contents of a ROOT file to an awkward dictionary.

        Arguments:

            file        A `Path` to the file containing the data

        Returns:

            A dictionary of lazy awkward arrays, keyed by column name

        Note:

            - Work is done on a second thread.
            - numpy and awkward are only imported if this is called.

        '''
        from numpy import ndarray
        from awkward import JaggedArray

        def do_the_work(file: Path) -> Dict[Union[str, bytes], Union[ndarray, JaggedArray]]:
            import uproot

            f_in = uproot.open(file)
            try:
                r = f_in[f_in.keys()[0]]
                return r.lazyarrays()  # type: ignore
            finally:
                f_in._context.source.close()

        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))

    async def _convert_parquet_to_awkward(self, file: Path):
        '''
        Convert the contents of a parquet file to an awkward dictionary.

        Arguments:

            file        A `Path` to the file containing the data

        Returns:

            An awkward `ChunkedArray` (or a dictionary of them)

        Note:

            - Work is done on a second thread.
            - awkward is only imported if this is called.

        '''
        import awkward as ak

        def do_the_work(file: Path) -> \
                Union[Dict[Union[str, bytes], ak.ChunkedArray], ChunkedArray]:
            # TODO: When we move to awkward1, make sure this becomes lazy
            return ak.fromparquet(str(file))

        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
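
A short usage sketch for the new adaptor; the file paths are hypothetical, and the conversions are awaited because the work is pushed onto the module's thread pool:

```python
import asyncio
from pathlib import Path
from servicex.data_conversions import DataConverterAdaptor

async def main():
    converter = DataConverterAdaptor(default_file_type='parquet')

    # Uses the default file type given above.
    awk_data = await converter.convert_to_awkward(Path('data/servicex-output-1.parquet'))

    # Or state the type explicitly, e.g. for a ROOT file from an xaod backend.
    df = await converter.convert_to_pandas(Path('data/servicex-output-2.root'), file_type='root')
    print(len(df), len(awk_data))

asyncio.run(main())
```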

servicex/servicex.py

Lines changed: 18 additions & 10 deletions
@@ -2,6 +2,7 @@
 import asyncio
 import functools
 import logging
+from servicex.servicex_config import ServiceXConfigAdaptor
 import time
 from datetime import timedelta
 from pathlib import Path
@@ -11,11 +12,9 @@
 import aiohttp
 import backoff
 from backoff import on_exception
-from confuse import ConfigView

 from .cache import Cache
-from .ConfigSettings import ConfigSettings
-from .data_conversions import _convert_root_to_awkward, _convert_root_to_pandas
+from .data_conversions import DataConverterAdaptor
 from .minio_adaptor import (MinioAdaptor, MinioAdaptorFactory,
                             find_new_bucket_files)
 from .servicex_adaptor import (ServiceXAdaptor, servicex_adaptor_factory,
@@ -46,7 +45,8 @@ def __init__(self,
                  status_callback_factory: Optional[StatusUpdateFactory] = _run_default_wrapper,
                  local_log: log_adaptor = None,
                  session_generator: Callable[[], Awaitable[aiohttp.ClientSession]] = None,
-                 config_adaptor: ConfigView = None):
+                 config_adaptor: Optional[ServiceXConfigAdaptor] = None,
+                 data_convert_adaptor: Optional[DataConverterAdaptor] = None):
         '''
         Create and configure a ServiceX object for a dataset.

@@ -76,6 +76,9 @@ def __init__(self,
                                  `servicex` queries is used.
             config_adaptor       Control how configuration options are read from the
                                  `.servicex` file.
+            data_convert_adaptor Manages conversions between root and parquet and `pandas`
+                                 and `awkward`, including default settings for expected
+                                 datatypes from the backend.

         Notes:

@@ -97,21 +100,21 @@ def __init__(self,

         # Get the local settings
         config = config_adaptor if config_adaptor is not None \
-            else ConfigSettings('servicex', 'servicex')
+            else ServiceXConfigAdaptor()

         # Establish the cache that will store all our queries
-        self._cache = Cache(get_configured_cache_path(config)) \
+        self._cache = Cache(get_configured_cache_path(config.settings)) \
             if cache_adaptor is None \
             else cache_adaptor

         if not servicex_adaptor:
             # Given servicex adaptor is none, this should be ok. Fixes type checkers
             assert backend_type is not None
-            servicex_adaptor = servicex_adaptor_factory(config, backend_type)
+            servicex_adaptor = servicex_adaptor_factory(config.settings, backend_type)
         self._servicex_adaptor = servicex_adaptor

         if not minio_adaptor:
-            self._minio_adaptor = MinioAdaptorFactory(config)
+            self._minio_adaptor = MinioAdaptorFactory(config.settings)
         else:
             if isinstance(minio_adaptor, MinioAdaptor):
                 self._minio_adaptor = MinioAdaptorFactory(always_return=minio_adaptor)
@@ -123,6 +126,9 @@ def __init__(self,
         self._session_generator = session_generator if session_generator is not None \
             else default_client_session

+        self._converter = data_convert_adaptor if data_convert_adaptor is not None \
+            else DataConverterAdaptor(config.get_default_returned_datatype(backend_type))
+
     @functools.wraps(ServiceXABC.get_data_rootfiles_async, updated=())
     @_wrap_in_memory_sx_cache
     async def get_data_rootfiles_async(self, selection_query: str) -> List[Path]:
@@ -137,13 +143,15 @@ async def get_data_parquet_async(self, selection_query: str) -> List[Path]:
     @_wrap_in_memory_sx_cache
     async def get_data_pandas_df_async(self, selection_query: str):
         import pandas as pd
-        return pd.concat(await self._data_return(selection_query, _convert_root_to_pandas))
+        return pd.concat(await self._data_return(
+            selection_query, lambda f: self._converter.convert_to_pandas(f)))

     @functools.wraps(ServiceXABC.get_data_awkward_async, updated=())
     @_wrap_in_memory_sx_cache
     async def get_data_awkward_async(self, selection_query: str):
         import awkward
-        all_data = await self._data_return(selection_query, _convert_root_to_awkward)
+        all_data = await self._data_return(
+            selection_query, lambda f: self._converter.convert_to_awkward(f))
         col_names = all_data[0].keys()
         return {c: awkward.concatenate([ar[c] for ar in all_data]) for c in col_names}
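
Finally, a sketch of the two new constructor hooks; the dataset name is hypothetical and `ServiceX` is assumed to be the package's top-level class. Passing the adaptors explicitly bypasses what would otherwise be built from the `.servicex` configuration:

```python
from servicex import ServiceX
from servicex.data_conversions import DataConverterAdaptor
from servicex.servicex_config import ServiceXConfigAdaptor

ds = ServiceX(
    'mc15_13TeV:some.rucio.dataset',                     # hypothetical dataset
    backend_type='xaod',
    config_adaptor=ServiceXConfigAdaptor(),
    data_convert_adaptor=DataConverterAdaptor('root'),   # xaod backends return ROOT files
)
```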
