@@ -524,42 +524,6 @@ def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) -
524
524
)
525
525
526
526
527
- # TODO(phw2): deprecate this once I'm done with unittest_workspace.py
528
def open_and_save(
    dbgym_workspace: DBGymWorkspace, open_path: Path, mode: str = "r"
) -> IO[Any]:
    """
    Save an existing input file into the current run, then open it.

    The path is validated, handed to save_file() so that it is recorded under
    the workspace's current run directory, and finally opened with the builtin
    open() using `mode`.

    Requirements on open_path (each enforced with an assertion, in this order):
      * it is a Path instance
      * it is fully resolved according to is_fully_resolved()
      * it is not a symlink
      * it exists
      * it is a regular file (directories cannot be opened)

    Do not use this for files you are about to create as results of the run:
    the path must already exist, so use the plain open() for those.

    Returns the file object produced by open(open_path, mode=mode). The caller
    is responsible for closing it.
    """
    # --- validation -------------------------------------------------------
    assert isinstance(open_path, Path)

    fully_resolved = is_fully_resolved(open_path)
    assert (
        fully_resolved
    ), f"open_and_save(): open_path ({ open_path } ) should be a fully resolved path"

    is_link = os.path.islink(open_path)
    assert not is_link, f"open_path ({ open_path } ) should not be a symlink"

    assert os.path.exists(open_path), f"open_path ({ open_path } ) does not exist"

    # Only files can be opened. What save_file() stores for this path may still
    # be a symlink to a directory; see save_file() for the details.
    assert os.path.isfile(open_path), f"open_path ({ open_path } ) is not a file"

    # --- record the file in this run, then hand back an open handle --------
    save_file(dbgym_workspace, open_path)
    handle = open(open_path, mode=mode)
    return handle
561
-
562
-
563
527
def extract_from_task_run_path (
564
528
dbgym_workspace : DBGymWorkspace , task_run_path : Path
565
529
) -> tuple [Path , str , Path , str ]:
@@ -593,128 +557,6 @@ def extract_from_task_run_path(
593
557
return codebase_path , codebase_dname , org_path , org_dname
594
558
595
559
596
- # TODO(phw2): deprecate this once I'm done with unittest_workspace.py
597
def save_file(dbgym_workspace: DBGymWorkspace, path: Path) -> None:
    """
    Record an input file inside the current run directory.

    Use this when a file is consumed by code that cannot go through
    open_and_save(). `path` must be a fully resolved path to a regular file
    that was not produced by the current run.

    How the file is recorded depends on where it lives:
      * Outside dbgym_runs_path (a "config"): the file is copied to
        [this run]/unknown/all/, since it may change or disappear later.
      * Inside dbgym_runs_path (a "dependency" generated by some run): a
        "<name>.link" symlink is created under
        [this run]/[codebase]/[org]/. If the file sits directly in the org
        directory, the file itself is linked; otherwise the top-most ancestor
        directory that is still a direct child of the org directory is linked,
        so that one link covers everything that run produced there.
    """
    # --- validation -------------------------------------------------------
    assert is_fully_resolved(path), f"path ({ path } ) should be a fully resolved path"
    assert os.path.isfile(path), f"path ({ path } ) should be a file"
    generated_by_this_run = is_child_path(path, dbgym_workspace.dbgym_this_run_path)
    assert (
        not generated_by_this_run
    ), f"path ({ path } ) was generated in this task run ({ dbgym_workspace .dbgym_this_run_path } ). You do not need to save it"

    # --- case 1: not generated by any run -> copy ----------------------------
    if not is_child_path(path, dbgym_workspace.dbgym_runs_path):
        # We know nothing about where this file came from, so it is filed under
        # codebase "unknown" and org "all".
        copy_dir = dbgym_workspace.dbgym_this_run_path / "unknown" / "all"
        os.makedirs(copy_dir, exist_ok=True)
        file_name = basename_of_path(path)
        shutil.copy(path, copy_dir / file_name)
        return

    # --- case 2: generated by a run -> symlink -------------------------------
    # Run outputs are treated as immutable and can be very large, so a symlink
    # is both safe and much cheaper than a copy.
    _, codebase_dname, org_path, org_dname = extract_from_task_run_path(
        dbgym_workspace, path
    )
    link_dir = dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname
    os.makedirs(link_dir, exist_ok=True)

    # Climb from the file towards org_path and stop at the entry whose parent
    # is org_path. If the file is directly inside org_path the loop body never
    # runs and the file itself is the link target.
    link_target = path
    container = parent_path_of_path(link_target)
    while not container.samefile(org_path):
        link_target = container
        container = parent_path_of_path(link_target)

    link_name = basename_of_path(link_target) + ".link"
    try_create_symlink(link_target, link_dir / link_name)
656
-
657
-
658
- # TODO(phw2): deprecate this once I'm done with unittest_workspace.py
659
def link_result(
    dbgym_workspace: DBGymWorkspace,
    result_path: Path,
    custom_result_name: Optional[str] = None,
) -> Path:
    """
    Publish a result of the current run as a symlink under symlinks/.

    Given a file or directory located in task_runs/run_*/[codebase]/[org], a
    symlink to it is created inside symlinks/[codebase]/[org]/. Any symlink of
    the same name that already exists there is removed first, so symlinks/
    always points at the most recently generated version.

    Args:
        dbgym_workspace: the workspace whose run and symlink directories are used.
        result_path: fully resolved path to a file or directory (not a symlink)
            inside dbgym_workspace.dbgym_this_run_path.
        custom_result_name: name to give the symlink. It must end with ".link"
            but not ".link.link". When omitted, the name is the basename of
            result_path with ".link" appended.

    Returns:
        The path of the symlink that was created.

    Raises:
        AssertionError: if any of the requirements above is violated, or if
            result_path is neither a file nor a directory.
    """
    # --- validate result_path ---------------------------------------------
    assert isinstance(result_path, Path)
    assert is_fully_resolved(
        result_path
    ), f"result_path ({ result_path } ) should be a fully resolved path"
    assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path)
    assert not os.path.islink(result_path)

    # --- decide the symlink's name ------------------------------------------
    if isinstance(custom_result_name, str):
        result_name = custom_result_name
    elif os.path.isfile(result_path) or os.path.isdir(result_path):
        # Files and directories are named the same way.
        result_name = basename_of_path(result_path) + ".link"
    else:
        raise AssertionError("result_path must be either a file or dir")

    # Check the name before touching the filesystem so that a bad
    # custom_result_name does not leave freshly created directories behind.
    assert result_name.endswith(".link") and not result_name.endswith(
        ".link.link"
    ), f'result_name ({ result_name } ) should end with ".link"'

    # --- figure out the directory that will hold the symlink ----------------
    codebase_path, codebase_dname, _, org_dname = extract_from_task_run_path(
        dbgym_workspace, result_path
    )
    # Only results generated by this invocation may be linked, which means they
    # must live under cur_task_runs_path().
    assert codebase_path.samefile(
        dbgym_workspace.cur_task_runs_path()
    ), "link_result should only be called on files generated by this invocation to task.py"
    symlink_parent_path = (
        dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname
    )
    symlink_parent_path.mkdir(parents=True, exist_ok=True)

    # --- replace any older symlink with one pointing at this result ---------
    # With several processes running at once, the link removed here may have
    # been created by another process of the same run rather than an earlier
    # run. Either way a symlink from the current run ends up in place.
    symlink_path = symlink_parent_path / result_name
    try_remove_file(symlink_path)
    try_create_symlink(result_path, symlink_path)

    return symlink_path
716
-
717
-
718
560
def try_create_symlink (src_path : Path , dst_path : Path ) -> None :
719
561
"""
720
562
Our functions that create symlinks might be called by multiple processes at once
0 commit comments