ssl-hep
diff --git a/‎.vscode/settings.json
Lines changed: 4 additions & 4 deletions b/‎.vscode/settings.json
Lines changed: 4 additions & 4 deletions
diff --git a/‎README.md
Lines changed: 6 additions & 5 deletions b/‎README.md
Lines changed: 6 additions & 5 deletions
diff --git a/‎servicex/config_default.yaml
Lines changed: 19 additions & 3 deletions b/‎servicex/config_default.yaml
Lines changed: 19 additions & 3 deletions
diff --git a/‎servicex/data_conversions.py
Lines changed: 161 additions & 44 deletions b/‎servicex/data_conversions.py
Lines changed: 161 additions & 44 deletions
@@ -26,6 +26,8 @@
         "aexit",
         "aiohttp",
         "asyncio",
+        "awks",
+        "backends",
         "cacheme",
         "codecov",
         "dcache",
@@ -55,6 +57,7 @@
         "prereleased",
         "protomolecule",
         "ptetaphi",
+        "pyarrow",
         "pypa",
         "pypi",
         "pytest",
@@ -76,8 +79,5 @@
         "xaod",
         "xrootd"
     ],
-    "python.analysis.typeCheckingMode": "basic",
-    "python.testing.pytestArgs": [
-        "--no-cov"
-    ]
+    "python.analysis.typeCheckingMode": "basic"
 }
@@ -35,10 +35,9 @@ The `servicex` library searches for configuration information in several locatio
 
 If no endpoint is specified, then the library defaults to the developer endpoint, which is `http://localhost:5000` for the web-service API, and `localhost:9000` for the `minio` endpoint. No passwords are required.
 
-Create a `.servicex` file, in the `yaml` format, in the appropriate place for your work that contains the following:
+Create a `.servicex` file, in the `yaml` format, in the appropriate place for your work that contains the following (for the `xaod` backend; use `uproot` for the uproot backend):
 
 ```yaml
-<<<<<<< HEAD
 api_endpoints:
   - endpoint: <your-endpoint>
     token: <api-token>
@@ -59,7 +58,7 @@ The following lines will return a `pandas.DataFrame` containing all the jet pT's
     from servicex import ServiceX
     query = "(call ResultTTree (call Select (call SelectMany (call EventDataset (list 'localds:bogus')) (lambda (list e) (call (attr e 'Jets') 'AntiKt4EMTopoJets'))) (lambda (list j) (/ (call (attr j 'pt')) 1000.0))) (list 'JetPt') 'analysis' 'junk.root')"
     dataset = "mc15_13TeV:mc15_13TeV.361106.PowhegPythia8EvtGen_AZNLOCTEQ6L1_Zee.merge.DAOD_STDM3.e3601_s2576_s2132_r6630_r6264_p2363_tid05630052_00"
-    ds = ServiceXDataset(dataset, 'xaod')
+    ds = ServiceXDataset(dataset)
     r = ds.get_data_pandas_df(query)
     print(r)
 ```
@@ -105,6 +104,7 @@ The file can contain an `api_endpoint` as mentioned above. In addition the other
   - Linux: `cache_path: "/home/servicex-cache"`
 
 - `minio_endpoint`, `minio_username`, `minio_password` - these are only interesting if you are using a pre-RC2 release of `servicex` - when the `minio` information wasn't part of the API exchange. This feature is depreciated and will be removed around the time `servicex` moves to RC3.
+- `backend_types` - a list of yaml dictionaries that contain defaults for each backend. By default only `return_data` is set: `root` for the `xaod` backend and `parquet` for the `uproot` backend. This lets `servicex` convert the returned files to a `pandas.DataFrame` or an `awkward` array if requested by the user.
 
 All strings are expanded using python's [os.path.expand](https://docs.python.org/3/library/os.path.html#os.path.expandvars) method - so `$NAME` and `${NAME}` will work to expand existing environment variables.
 
@@ -161,8 +161,9 @@ Everything is based around the `ServiceXDataset` object. Below is the documentat
           dataset                     Name of a dataset from which queries will be selected.
           backend_type                The type of backend. Used only if we need to find an
                                       end-point. If we do not have a `servicex_adaptor` then this
-                                      cannot be null. Possible types are `uproot`, `xaod`,
-                                      and anything that finds a match in the `.servicex` file.
+                                      will default to `xaod`, unless you have any endpoints listed
+                                      in your `.servicex` file; in that case it defaults to the
+                                      best match found there.
           image                       Name of transformer image to use to transform the data
           max_workers                 Maximum number of transformers to run simultaneously on
                                       ServiceX.
 
@@ -1,10 +1,13 @@
 # Default settings for servicex. This will point you to a developer end-point, that
 # you've setup on your own machine (usually using k8's port-forward command):
 
-api_endpoint:
-  endpoint: http://localhost:5000
-  # token: xxx
+api_endpoints:
+  - endpoint: http://localhost:5000
+    # token: xxx
 
+# These are default settings, deprecated, and should not be used.
+# They will be removed in the next version.
+api_endpoint:
   minio_endpoint: localhost:9000
   # The username and password for accessing files generated by servicex.
   # NOTE:
@@ -22,3 +25,16 @@ cache_path: /tmp/servicex
 # This is a dummy value, here only to make sure that unit testing
 # works properly before package release.
 testing_value: 10
+
+# If we can't figure out what backend the user is going to use, we
+# return this sort of file. Parquet for the uproot backend, and root for the
+# xaod backend.
+default_return_data: parquet
+
+# Defaults for the various types of servicex backends that we might deal with.
+# Easy enough to add a new one here...
+backend_types:
+  - type: xaod
+    return_data: root
+  - type: uproot
+    return_data: parquet
@@ -1,73 +1,190 @@
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor
 import asyncio
-from typing import Dict, Union
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Dict, Iterable, Optional, Union
+from awkward.array.chunked import ChunkedArray
+from awkward.array.table import Table
+
+import pandas as pd
+import awkward as ak
 
-_conversion_pool = ThreadPoolExecutor(2)
+from servicex.utils import ServiceXException
 
+_conversion_pool = ThreadPoolExecutor(4)
 
-async def _convert_root_to_pandas(file: Path):
+
+class DataConverterAdaptor:
+    '''Methods to convert from one type of data to the other.
     '''
-    Convert the contents of a ROOT file to pandas.
+    def __init__(self, default_file_type: str):
+        '''Create a data converter adaptor. By default it will do the
+        conversion as requested.
 
-    Arguments:
+        Args:
+            default_file_type (str): The default file type (`parquet` or `root`)
+        '''
+        self._default_file_type = default_file_type
 
-        file        A `Path` to the file containing the pandas data
+    async def convert_to_pandas(self, file: Path, file_type: Optional[str] = None):
+        '''Convert to a pandas dataframe from data stored in a file of a particular file_type
 
-    Returns:
+        Args:
+            file (Path): Path to the file
+            file_type (str): What the file contains (root, parquet, etc)
+        '''
+        file_type = file_type if file_type is not None else self._default_file_type
+        if file_type == 'root':
+            return await self._convert_root_to_pandas(file)
+        elif file_type == 'parquet':
+            return await self._convert_parquet_to_pandas(file)
+        else:
+            raise ServiceXException(f'Conversion from {file_type} into an pandas DF is not '
+                                    'yet supported')
 
-        DataFrame   A pandas dataframe
+    async def convert_to_awkward(self, file: Path, file_type: Optional[str] = None):
+        '''Convert to an awkward data array from data stored in a file of a particular file_type
 
-    Note:
+        Args:
+            file (Path): Path to the file
+            file_type (str): What the file contains (root, parquet, etc)
+        '''
+        file_type = file_type if file_type is not None else self._default_file_type
+        if file_type == 'root':
+            return await self._convert_root_to_awkward(file)
+        elif file_type == 'parquet':
+            return await self._convert_parquet_to_awkward(file)
+        else:
+            raise ServiceXException(f'Conversion from {file_type} into an awkward array is not '
+                                    'yet supported')
 
-        - Work is done on a second thread.
-        - Pandas is only imported if this is called.
+    def combine_pandas(self, dfs: Iterable[pd.DataFrame]) -> pd.DataFrame:
+        '''Combine many pandas dataframes into a single one, in order.
 
-    '''
-    from pandas import DataFrame
+        Args:
+            dfs (Iterable[pd.DataFrame]): The list of dataframes
+        '''
+        return pd.concat(dfs)
 
-    def do_the_work(file: Path) -> DataFrame:
-        import uproot
+    def combine_awkward(self, awks: Iterable[Union[Table, ChunkedArray]]) -> Table:
+        '''Combine many awkward arrays into a single one, in order.
 
-        f_in = uproot.open(file)
-        try:
-            r = f_in[f_in.keys()[0]]
-            return r.pandas.df()  # type: ignore
-        finally:
-            f_in._context.source.close()
+        Args:
+            awks (Iterable[Union[Table, ChunkedArray]]): The input list of awkward arrays
+        '''
+        return ak.concatenate(awks)
 
-    return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
+    async def _convert_root_to_pandas(self, file: Path):
+        '''
+        Convert the contents of a ROOT file to pandas.
 
+        Arguments:
 
-async def _convert_root_to_awkward(file: Path):
-    '''
-    Convert the contents of a ROOT file to an awkward dictionary.
+            file        A `Path` to the file containing the pandas data
 
-    Arguments:
+        Returns:
 
-        file        A `Path` to the file containing the pandas data
+            DataFrame   A pandas dataframe
 
-    Returns:
+        Note:
 
-        DataFrame   A pandas dataframe
+            - Work is done on a second thread.
+            - Pandas is only imported if this is called.
 
-    Note:
+        '''
+        from pandas import DataFrame
 
-        - Work is done on a second thread.
-        - Pandas is only imported if this is called.
+        def do_the_work(file: Path) -> DataFrame:
+            import uproot
 
-    '''
-    from numpy import ndarray
-    from awkward import JaggedArray
+            f_in = uproot.open(file)
+            try:
+                r = f_in[f_in.keys()[0]]
+                return r.pandas.df()  # type: ignore
+            finally:
+                f_in._context.source.close()
+
+        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
+
+    async def _convert_parquet_to_pandas(self, file: Path):
+        '''
+        Convert the contents of a parquet file to pandas.
+
+        Arguments:
+
+            file        A `Path` to the file containing the pandas data
+
+        Returns:
+
+            DataFrame   A pandas dataframe
+
+        Note:
+
+            - Work is done on a second thread.
+            - Pandas is only imported if this is called.
+
+        '''
+        import pandas as pd
+
+        def do_the_work(file: Path) -> pd.DataFrame:
+            return pd.read_parquet(str(file))
 
-    def do_the_work(file: Path) -> Dict[bytes, Union[ndarray, JaggedArray]]:
-        import uproot
+        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
 
-        f_in = uproot.open(file)
-        try:
+    async def _convert_root_to_awkward(self, file: Path):
+        '''
+        Convert the contents of a ROOT file to an awkward dictionary.
+
+        Arguments:
+
+            file        A `Path` to the ROOT file to convert
+
+        Returns:
+
+            LazyArray   A lazily-loaded awkward array, as produced by uproot's `lazyarrays()`
+
+        Note:
+
+            - Work is done on a second thread.
+            - Awkward is only imported if this is called.
+            - A LazyArray is returned, so it isn't completely loaded into memory. That also means
+              this will leak filehandles - as that has to be left open.
+
+        '''
+        from numpy import ndarray
+        from awkward import JaggedArray
+
+        def do_the_work(file: Path) -> Dict[Union[str, bytes], Union[ndarray, JaggedArray]]:
+            import uproot
+
+            f_in = uproot.open(file)
             r = f_in[f_in.keys()[0]]
             return r.lazyarrays()  # type: ignore
-        finally:
-            f_in._context.source.close()
 
-    return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
+        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
+
+    async def _convert_parquet_to_awkward(self, file: Path):
+        '''
+        Convert the contents of a parquet file to an awkward dictionary.
+
+        Arguments:
+
+            file        A `Path` to the parquet file to convert
+
+        Returns:
+
+            awkward     The awkward array produced by `ak.fromparquet`
+
+        Note:
+
+            - Work is done on a second thread.
+            - Awkward is imported locally when this is called.
+
+        '''
+        import awkward as ak
+
+        def do_the_work(file: Path) -> \
+                Union[Dict[Union[str, bytes], ak.ChunkedArray], ChunkedArray]:
+            # TODO: When we move to awkward1, make sure this becomes lazy
+            return ak.fromparquet(str(file))
+
+        return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))