Skip to content

packaging #2438

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions metaflow/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .exception import CommandException, MetaflowException
from .flowspec import _FlowState
from .graph import FlowGraph
from .meta_files import read_included_dist_info
from .metaflow_config import (
DEFAULT_DATASTORE,
DEFAULT_DECOSPECS,
Expand All @@ -27,6 +28,7 @@
from .metaflow_current import current
from metaflow.system import _system_monitor, _system_logger
from .metaflow_environment import MetaflowEnvironment
from .package.mfenv import PackagedDistributionFinder
from .plugins import (
DATASTORES,
ENVIRONMENTS,
Expand Down Expand Up @@ -326,6 +328,11 @@ def start(
echo(" executing *%s*" % ctx.obj.flow.name, fg="magenta", nl=False)
echo(" for *%s*" % resolve_identity(), fg="magenta")

# Check if we need to setup the distribution finder (if running )
dist_info = read_included_dist_info()
if dist_info:
sys.meta_path.append(PackagedDistributionFinder(dist_info))

# Setup the context
cli_args._set_top_kwargs(ctx.params)
ctx.obj.echo = echo
Expand Down
12 changes: 8 additions & 4 deletions metaflow/client/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@
from metaflow.includefile import IncludedFile
from metaflow.metaflow_config import DEFAULT_METADATA, MAX_ATTEMPTS
from metaflow.metaflow_environment import MetaflowEnvironment
from metaflow.meta_files import MFCONF_DIR, MFENV_DIR
from metaflow.package.mfenv import MFEnv
from metaflow.plugins import ENVIRONMENTS, METADATA_PROVIDERS
from metaflow.meta_files import MetaFile
from metaflow.unbounded_foreach import CONTROL_TASK_TAG
from metaflow.util import cached_property, is_stringish, resolve_identity, to_unicode

from ..info_file import INFO_FILE
from .filecache import FileCache

if TYPE_CHECKING:
Expand Down Expand Up @@ -824,9 +826,8 @@ def __init__(self, flow_name: str, code_package: str):
)
code_obj = BytesIO(blobdata)
self._tar = tarfile.open(fileobj=code_obj, mode="r:gz")
# The JSON module in Python3 deals with Unicode. Tar gives bytes.
info_str = (
self._tar.extractfile(os.path.basename(INFO_FILE)).read().decode("utf-8")
info_str = MFEnv.get_archive_content(self._tar, MetaFile.INFO_FILE).decode(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be cleaned up better - MFEnv.from_archive(self._tar).get_info()

encoding="utf-8"
)
self._info = json.loads(info_str)
self._flowspec = self._tar.extractfile(self._info["script"]).read()
Expand Down Expand Up @@ -917,6 +918,9 @@ def extract(self) -> TemporaryDirectory:
# This file is created when using the conda/pypi features available in
# nflx-metaflow-extensions: https://github.com/Netflix/metaflow-nflx-extensions
"condav2-1.cnd",
# Going forward, we only need to exclude MFENV_DIR and MFCONF_DIR
MFENV_DIR,
MFCONF_DIR,
]
members = [
m
Expand Down
43 changes: 23 additions & 20 deletions metaflow/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,14 @@
from .parameters import current_flow
from .user_configs.config_decorators import CustomStepDecorator
from .user_configs.config_parameters import (
ConfigValue,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not used

UNPACK_KEY,
resolve_delayed_evaluator,
unpack_delayed_evaluator,
)

from metaflow._vendor import click

try:
unicode
except NameError:
unicode = str
basestring = str
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can pull out Python2 changes in a separate PR

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

THere are definitely other changes. We can revert this but I think we can leave this here for now and yes, do a separate PR to pull out all the other ones.


# Contains the decorators on which _init was called. We want to ensure it is called
# only once on each decorator and, as the _init() function below can be called in
Expand Down Expand Up @@ -189,7 +185,7 @@ def make_decorator_spec(self):
# escaping but for more complex types (typically dictionaries or lists),
# we dump using JSON.
for k, v in attrs.items():
if isinstance(v, (int, float, unicode, basestring)):
if isinstance(v, (int, float, str)):
attr_list.append("%s=%s" % (k, str(v)))
else:
attr_list.append("%s=%s" % (k, json.dumps(v).replace('"', '\\"')))
Expand Down Expand Up @@ -315,15 +311,26 @@ def package_init(self, flow, step_name, environment):

def add_to_package(self):
"""
Called to add custom packages needed for a decorator. This hook will be
Called to add custom files needed for a decorator. This hook will be
called in the `MetaflowPackage` class where metaflow compiles the code package
tarball. This hook is invoked in the `MetaflowPackage`'s `path_tuples`
function. The `path_tuples` function is a generator that yields a tuple of
`(file_path, arcname)`.`file_path` is the path of the file in the local file system;
the `arcname` is the path of the file in the constructed tarball or the path of the file
after decompressing the tarball.

Returns a list of tuples where each tuple represents (file_path, arcname)
tarball. This hook can return one of two things:
- a generator yielding a tuple of `(file_path, arcname)` to add files to
the code package. `file_path` is the path to the file on the local filesystem
and `arcname` is the path relative to the packaged code.
- a generator yielding a tuple of `(content, arcname, type)` where:
- type is a AddToPackageType
- for CODE_FILE:
- content: path to the file to include
- arcname: path relative to the code directory in the package
- for CODE_MODULE:
- content: name of the module
- arcame: None (ignored)
- for CONFIG_FILE:
- content: path to the file to include
- arcname: path relative to the config directory in the package
- for CONFIG_CONTENT:
- content: bytes to include
- arcname: path relative to the config directory in the package
Comment on lines +316 to +333
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is quite funky - returning one of two things with different types. need to fix this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you suggest? As I mentioned:

  • we can make it a breaking change (change the signature of add_package)
  • we can add a different method (feels wonky too)
  • ??

"""
return []

Expand Down Expand Up @@ -685,12 +692,8 @@ def foo(self):
f.is_step = True
f.decorators = []
f.config_decorators = []
try:
# python 3
f.name = f.__name__
except:
# python 2
f.name = f.__func__.func_name
f.wrappers = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually not related to this PR. It slipped through.

f.name = f.__name__
return f


Expand Down
2 changes: 1 addition & 1 deletion metaflow/extension_support/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from itertools import chain
from pathlib import Path

from metaflow.info_file import read_info_file
from metaflow.meta_files import read_info_file


#
Expand Down
4 changes: 2 additions & 2 deletions metaflow/extension_support/_empty_file.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# This file serves as a __init__.py for metaflow_extensions when it is packaged
# and needs to remain empty.
# This file serves as a __init__.py for metaflow_extensions or metaflow
# packages when they are packaged and needs to remain empty.
25 changes: 0 additions & 25 deletions metaflow/info_file.py

This file was deleted.

132 changes: 132 additions & 0 deletions metaflow/meta_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import json
import os

from enum import Enum
from typing import Any, Dict, Optional, Union

from .util import get_metaflow_root

_info_file_content = None
_info_file_present = None
_included_dist_info = None
_included_dist_present = None

# Ideally these would be in package/mfenv.py but that would cause imports to fail so
# moving here. The reason is that this is needed to read extension information which needs
# to happen before mfenv gets packaged.

MFENV_DIR = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change MFENV to MFCODE. we already have --environment and @Environment - MFENV will only lead to more confusion here and everywhere.

".mfenv" # Directory containing "system" code (metaflow and user dependencies)
)
MFCONF_DIR = ".mfconf" # Directory containing Metaflow's configuration files
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider changing the name to something else since it is neither metaflow config not the new config feature

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we can have MFCODE_DIR and MFINFO_DIR ?

MFENV_MARKER = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't this be just in the info file? will simplify things quite a bit

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would make it less flexible. The marker tells you how to access the info file. The info file is not in the same place already.

".mfenv_install" # Special file containing metadata about how Metaflow is packaged
)


class MetaFile(Enum):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for future generations looking at the code, it will be immensely useful to call out in detail what is included in these files.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I can add this. We didn't add any file (except dist info) but can make it more clear as part of making the code better.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw, do we need to have three separate files? any value in merging all/some?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can definitely have them all as one. I vaguely remember you being in favor of separate files in the past but I have no strong preference.

INFO_FILE = "INFO"
CONFIG_FILE = "CONFIG_PARAMETERS"
INCLUDED_DIST_INFO = "INCLUDED_DIST_INFO"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is INCLUDED_DIST_INFO - hard to understand what this implies - is there an EXCLUDED_DIST_INFO too somewhere?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can clarify: it means the information about any distribution that is included. So no, there is no excluded one. I can just hame it DIST_INFO.



def meta_file_name(name: Union[MetaFile, str]) -> str:
if isinstance(name, MetaFile):
return name.value
return name


def generic_get_filename(
name: Union[MetaFile, str], is_meta: Optional[bool] = None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is_meta is not actually used. consider dropping

) -> Optional[str]:
# We are not in a MFEnv package (so this is an old style package). Everything
# is at metaflow root. There is no distinction between code and config.
real_name = meta_file_name(name)

path_to_file = os.path.join(get_metaflow_root(), real_name)
if os.path.isfile(path_to_file):
return path_to_file
return None


def v1_get_filename(
name: Union[MetaFile, str],
meta_info: Dict[str, Any],
is_meta: Optional[bool] = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how does this work if i am interested in a file that unfortunately clashes with an internal reserved name? also, is_meta actually useful? what if we simply had another method that provided the contents of INFO file and it included everything you need?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps by setting is_meta to false actively somewhere?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let me check on that. The idea of putting stuff in mfconf is that they won't clash with code.

) -> Optional[str]:
if is_meta is None:
is_meta = isinstance(name, MetaFile)
if is_meta:
conf_dir = meta_info.get("conf_dir")
if conf_dir is None:
raise ValueError(
"Invalid package -- package info does not contain conf_dir key"
)
return os.path.join(conf_dir, meta_file_name(name))
# Not meta -- so code
code_dir = meta_info.get("code_dir")
if code_dir is None:
raise ValueError(
"Invalid package -- package info does not contain code_dir key"
)
return os.path.join(code_dir, meta_file_name(name))


get_filname_map = {1: v1_get_filename}


def get_filename(
name: Union[MetaFile, str], is_meta: Optional[bool] = None
) -> Optional[str]:
if os.path.exists(os.path.join(get_metaflow_root(), MFENV_MARKER)):
with open(
os.path.join(get_metaflow_root(), MFENV_MARKER), "r", encoding="utf-8"
) as f:
meta_info = json.load(f)
version = meta_info.get("version")
if version not in get_filname_map:
raise ValueError(
"Unsupported packaging version '%s'. Please update Metaflow" % version
)
Comment on lines +87 to +90
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will we actually ever hit this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes -- if a newer version of Metaflow has a newer packaging method and an old client is used to access stuff.

return get_filname_map[version](name, meta_info, is_meta)
return generic_get_filename(name, is_meta)


def read_info_file():
# The info file is a bit special because it needs to be read to determine what
# extensions to load. We need to therefore not load anything yet. This explains
# the slightly wheird code structure where there is a bit of the file loading logic
# here that is then used in MFEnv (it logically belongs in MFEnv but that file can't
# be loaded just yet).
global _info_file_content
global _info_file_present
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need this extra global

_UNINITIALISED = object()         # sentinel
_info_file_content = _UNINITIALISED


def read_info_file():
    """Return the parsed info.json once, or None if the file is absent."""
    global _info_file_content

    if _info_file_content is _UNINITIALISED:
        info_filename = get_filename(MetaFile.INFO_FILE)
        if info_filename is not None:
            with open(info_filename, "r", encoding="utf-8") as f:
                _info_file_content = json.load(f)
        else:
            _info_file_content = None

    return _info_file_content

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can make that change. Technically there is an extra global too (the _UNINITIALIZED) but it can be shared.


if _info_file_present is None:
info_filename = get_filename(MetaFile.INFO_FILE)
if info_filename is not None:
with open(info_filename, "r", encoding="utf-8") as f:
_info_file_content = json.load(f)
_info_file_present = True
else:
_info_file_present = False
if _info_file_present:
return _info_file_content
return None


def read_included_dist_info():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider renaming to read_dist_info()

global _included_dist_info
global _included_dist_present
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as above


from metaflow.package.mfenv import MFEnv

if _included_dist_present is None:
c = MFEnv.get_content(MetaFile.INCLUDED_DIST_INFO)
if c is not None:
_included_dist_info = json.loads(c.decode("utf-8"))
_included_dist_present = True
else:
_included_dist_present = False
if _included_dist_present:
return _included_dist_info
return None
1 change: 1 addition & 0 deletions metaflow/metaflow_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,7 @@
"stubgen",
"userconf",
"conda",
"package",
]

for typ in DEBUG_OPTIONS:
Expand Down
25 changes: 23 additions & 2 deletions metaflow/metaflow_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from metaflow.exception import MetaflowException
from metaflow.extension_support import dump_module_info
from metaflow.mflog import BASH_MFLOG, BASH_FLUSH_LOGS

from .meta_files import MFENV_DIR
from . import R


Expand Down Expand Up @@ -49,8 +51,26 @@ def bootstrap_commands(self, step_name, datastore_type):

def add_to_package(self):
"""
A list of tuples (file, arcname) to add to the job package.
`arcname` is an alternative name for the file in the job package.
Called to add custom files needed for this environment. This hook will be
called in the `MetaflowPackage` class where metaflow compiles the code package
tarball. This hook can return one of two things:
- a generator yielding a tuple of `(file_path, arcname)` to add files to
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a bit funky, we shouldn't return two types of tuples

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

already addressed above with a question.

the code package. `file_path` is the path to the file on the local filesystem
and `arcname` is the path relative to the packaged code.
- a generator yielding a tuple of `(content, arcname, type)` where:
- type is a AddToPackageType
- for CODE_FILE:
- content: path to the file to include
- arcname: path relative to the code directory in the package
- for CODE_MODULE:
- content: name of the module
- arcame: None (ignored)
- for CONFIG_FILE:
- content: path to the file to include
- arcname: path relative to the config directory in the package
- for CONFIG_CONTENT:
- content: bytes to include
- arcname: path relative to the config directory in the package
"""
return []

Expand Down Expand Up @@ -177,6 +197,7 @@ def get_package_commands(self, code_package_url, datastore_type):
"after 6 tries. Exiting...' && exit 1; "
"fi" % code_package_url,
"TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar",
"export PYTHONPATH=`pwd`/%s:$PYTHONPATH" % MFENV_DIR,
"mflog 'Task is starting.'",
"flush_mflogs",
]
Expand Down
2 changes: 1 addition & 1 deletion metaflow/metaflow_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from os import path, name, environ, listdir

from metaflow.extension_support import update_package_info
from metaflow.info_file import CURRENT_DIRECTORY, read_info_file
from metaflow.meta_files import read_info_file


# True/False correspond to the value `public`` in get_version
Expand Down
Loading
Loading