
Commit 228c5ab

support depth_estimation & doc_qa (#1800)

1 parent e3dc096 · commit 228c5ab
18 files changed: +495 −745 lines

mindnlp/core/ops/reduction.py

Lines changed: 1 addition & 2 deletions

@@ -38,8 +38,7 @@ def all(input, dim=None, keepdim=False, *, dtype=None):
 def any(input, dim=None, keepdim=False):
     if use_pyboost():
         return mindspore.mint.any(input, dim, keepdim)
-    any_ = _get_cache_prim(ops.ReduceAny)(keepdim)
-    return any_(input, dim)
+    return ops.any(input, dim)

 # max
 def max(input, dim=None, keepdim=False):
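
The rewrite drops the hand-built `ReduceAny` primitive in favor of MindSpore's functional API, which handles the `dim=None` (reduce everything) case itself. A minimal sketch of the behavior (assuming a MindSpore 2.x install; mindnlp's `use_pyboost` switch is not shown):

```python
from mindspore import Tensor, ops

x = Tensor([[True, False], [False, False]])

# Functional reduction now used by mindnlp's `any` wrapper.
print(ops.any(x, 1))  # per-row reduction: [ True False]
print(ops.any(x))     # reduce over all axes: True
```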

mindnlp/transformers/models/auto/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -66,6 +66,7 @@
     MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
     MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
     MODEL_FOR_VISION_2_SEQ_MAPPING,
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
     MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
     MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING,
     MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
@@ -147,6 +148,7 @@
     'MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING',
     'MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING',
     'MODEL_FOR_VISION_2_SEQ_MAPPING',
+    'MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING',
     'MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING',
     'MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING',
     'MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING',

mindnlp/transformers/models/auto/configuration_auto.py

Lines changed: 1 addition & 0 deletions

@@ -154,6 +154,7 @@
     ("musicgen_melody", "MusicgenMelodyConfig"),
     ("mt5", "MT5Config"),
     ("mvp", "MvpConfig"),
+    ("nougat", "VisionEncoderDecoderConfig"),
     ("nystromformer", "NystromformerConfig"),
     ("olmo", "OlmoConfig"),
     ("oneformer", "OneFormerConfig"),

mindnlp/transformers/models/auto/feature_extraction_auto.py

Lines changed: 3 additions & 0 deletions

@@ -374,6 +374,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):

         config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
         feature_extractor_class = config_dict.get("feature_extractor_type", None)
+
         feature_extractor_auto_map = None
         if "AutoFeatureExtractor" in config_dict.get("auto_map", {}):
             feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"]
@@ -392,6 +393,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):

         if feature_extractor_class is not None:
             return feature_extractor_class.from_dict(config_dict, **kwargs)
+
+        print(feature_extractor_class)
         # Last try: we use the FEATURE_EXTRACTOR_MAPPING.
         if type(config) in FEATURE_EXTRACTOR_MAPPING:
             feature_extractor_class = FEATURE_EXTRACTOR_MAPPING[type(config)]
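
For orientation, `from_pretrained` resolves the extractor class in three steps: an explicit `feature_extractor_type` in the checkpoint's config dict, an `auto_map` entry pointing at custom code, and finally `FEATURE_EXTRACTOR_MAPPING` keyed by the config class. A hedged sketch of the common first path (the checkpoint name is an assumption; requires network access):

```python
from mindnlp.transformers import AutoFeatureExtractor

# Resolved via "feature_extractor_type" in the checkpoint's
# preprocessor_config.json, the first lookup described above.
extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
print(type(extractor).__name__)  # expected: Wav2Vec2FeatureExtractor
```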

mindnlp/transformers/models/auto/modeling_auto.py

Lines changed: 29 additions & 0 deletions

@@ -731,6 +731,32 @@
     ]
 )

+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
+    [
+        ("blip", "BlipForConditionalGeneration"),
+        ("blip-2", "Blip2ForConditionalGeneration"),
+        ("chameleon", "ChameleonForConditionalGeneration"),
+        ("fuyu", "FuyuForCausalLM"),
+        ("git", "GitForCausalLM"),
+        ("idefics", "IdeficsForVisionText2Text"),
+        ("idefics2", "Idefics2ForConditionalGeneration"),
+        ("idefics3", "Idefics3ForConditionalGeneration"),
+        ("instructblip", "InstructBlipForConditionalGeneration"),
+        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
+        ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
+        ("mllama", "MllamaForConditionalGeneration"),
+        ("paligemma", "PaliGemmaForConditionalGeneration"),
+        ("pix2struct", "Pix2StructForConditionalGeneration"),
+        ("pixtral", "LlavaForConditionalGeneration"),
+        ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
+        ("udop", "UdopForConditionalGeneration"),
+        ("vipllava", "VipLlavaForConditionalGeneration"),
+        ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
+    ]
+)
+
 MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
     [
         # Model for Masked LM mapping
@@ -1397,6 +1423,9 @@
     CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
 )
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+)
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
 )
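
`_LazyAutoMapping` keeps these tables cheap: it stores config-name/class-name strings and imports a model class only when its key is first looked up. A self-contained sketch of that idea (not mindnlp's actual implementation, which also maps config classes via `CONFIG_MAPPING_NAMES`):

```python
import importlib
from collections import OrderedDict

# Model-type -> class-name strings, as in the mapping added above.
_NAMES = OrderedDict([("llava", "LlavaForConditionalGeneration")])

class LazyMapping:
    """Import a model class only on first lookup, caching the result."""

    def __init__(self, names, package="mindnlp.transformers"):
        self._names = names
        self._package = package
        self._cache = {}

    def __getitem__(self, model_type):
        if model_type not in self._cache:
            module = importlib.import_module(self._package)
            self._cache[model_type] = getattr(module, self._names[model_type])
        return self._cache[model_type]

mapping = LazyMapping(_NAMES)
# mapping["llava"] imports LlavaForConditionalGeneration on first access.
```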

mindnlp/transformers/models/dpt/image_processing_dpt.py

Lines changed: 39 additions & 1 deletion

@@ -43,7 +43,7 @@

 if is_mindspore_available():
     import mindspore
-    from mindspore.ops import interpolate
+    from mindnlp.core.nn.functional import interpolate

 if is_vision_available():
     from PIL import Image
@@ -484,6 +484,44 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: Union[List[T

         return semantic_segmentation

+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None,
+    ) -> List[Dict[str, TensorType]]:
+        """
+        Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
+        Only supports PyTorch.
+
+        Args:
+            outputs ([`DepthEstimatorOutput`]):
+                Raw outputs of the model.
+            target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+            predictions.
+        """
+        predicted_depth = outputs.predicted_depth
+
+        if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes
+        for depth, target_size in zip(predicted_depth, target_sizes):
+            if target_size is not None:
+                depth = interpolate(
+                    depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False
+                ).squeeze()
+
+            results.append({"predicted_depth": depth})
+
+        return results
+
 __all__ = [
     'DPTImageProcessor',
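
A hedged end-to-end sketch of the new hook (the checkpoint name, `DPTForDepthEstimation`, and the `return_tensors="ms"` convention are assumptions based on mindnlp's DPT port):

```python
from PIL import Image
from mindnlp.transformers import DPTForDepthEstimation, DPTImageProcessor

processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

image = Image.open("room.jpg")  # any RGB image
inputs = processor(images=image, return_tensors="ms")
outputs = model(**inputs)

# Bicubically resize each depth map back to its source (height, width).
results = processor.post_process_depth_estimation(
    outputs, target_sizes=[(image.height, image.width)]
)
depth = results[0]["predicted_depth"]  # one 2-D tensor per input image
```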
