Skip to content

Commit d469cfd

Browse files
authored
Merge pull request #757 from watson-developer-cloud/feat/disco-analyze-document
feat(DiscoveryV2): add support for analyze document
2 parents 5dcd167 + 6353f53 commit d469cfd

File tree

3 files changed

+352
-2
lines changed

3 files changed

+352
-2
lines changed

ibm_watson/discovery_v2.py

Lines changed: 243 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,87 @@ def update_training_query(self,
11591159
response = self.send(request)
11601160
return response
11611161

1162+
#########################
1163+
# analyze
1164+
#########################
1165+
1166+
def analyze_document(self,
                     project_id: str,
                     collection_id: str,
                     *,
                     file: BinaryIO = None,
                     filename: str = None,
                     file_content_type: str = None,
                     metadata: str = None,
                     **kwargs) -> 'DetailedResponse':
    """
    Analyze a Document.

    Process a document using the specified collection's settings and return it
    for realtime use.
    **Note:** Documents processed using this method are not added to the
    specified collection.
    **Note:** This method is only supported on IBM Cloud Pak for Data instances
    of Discovery.

    :param str project_id: The ID of the project. This information can be found
           from the deploy page of the Discovery administrative tooling.
    :param str collection_id: The ID of the collection.
    :param BinaryIO file: (optional) The content of the document to ingest. The
           maximum supported file size when adding a file to a collection is 50
           megabytes, the maximum supported file size when testing a
           configuration is 1 megabyte. Files larger than the supported size
           are rejected.
    :param str filename: (optional) The filename for file. When omitted, the
           base name of `file.name` is used if the file object has one.
    :param str file_content_type: (optional) The content type of file. Sent as
           `application/octet-stream` when not given.
    :param str metadata: (optional) The maximum supported metadata file size is
           1 MB. Metadata parts larger than 1 MB are rejected.
           Example:  ``` {
             "Creator": "Johnny Appleseed",
             "Subject": "Apples"
           } ```.
    :param dict headers: A `dict` containing the request headers
    :return: A `DetailedResponse` containing the result, headers and HTTP status code.
    :rtype: DetailedResponse
    :raises ValueError: if `project_id` or `collection_id` is None, or if a
            file is supplied and no filename can be determined for it.
    """

    if project_id is None:
        raise ValueError('project_id must be provided')
    if collection_id is None:
        raise ValueError('collection_id must be provided')

    # Caller headers go in first; the SDK headers are merged on top of them.
    request_headers = {}
    if 'headers' in kwargs:
        request_headers.update(kwargs['headers'])
    request_headers.update(
        get_sdk_headers(service_name=self.DEFAULT_SERVICE_NAME,
                        service_version='V2',
                        operation_id='analyze_document'))

    query = {'version': self.version}

    # Multipart form parts, in the order they are sent.
    parts = []
    if file:
        upload_name = filename
        if not upload_name and hasattr(file, 'name'):
            upload_name = basename(file.name)
        if not upload_name:
            raise ValueError('filename must be provided')
        content_type = file_content_type or 'application/octet-stream'
        parts.append(('file', (upload_name, file, content_type)))
    if metadata:
        parts.append(('metadata', (None, str(metadata), 'text/plain')))

    encoded_ids = self._encode_path_vars(project_id, collection_id)
    url = '/v2/projects/{0}/collections/{1}/analyze'.format(*encoded_ids)

    return self.send(
        self.prepare_request(method='POST',
                             url=url,
                             headers=request_headers,
                             params=query,
                             files=parts))
1242+
11621243
#########################
11631244
# enrichments
11641245
#########################
@@ -1224,7 +1305,6 @@ def create_enrichment(self,
12241305
if enrichment is None:
12251306
raise ValueError('enrichment must be provided')
12261307

1227-
print(enrichment)
12281308
headers = {}
12291309
if 'headers' in kwargs:
12301310
headers.update(kwargs.get('headers'))
@@ -1662,11 +1742,173 @@ class FileContentType(Enum):
16621742
APPLICATION_XHTML_XML = 'application/xhtml+xml'
16631743

16641744

1745+
class AnalyzeDocumentEnums:
    """
    Container for the enumerations named after the analyze_document operation.
    """

    class FileContentType(Enum):
        """
        The content type of file.
        """
        APPLICATION_JSON = 'application/json'
        APPLICATION_MSWORD = 'application/msword'
        APPLICATION_VND_OPENXMLFORMATS_OFFICEDOCUMENT_WORDPROCESSINGML_DOCUMENT = (
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        )
        APPLICATION_PDF = 'application/pdf'
        TEXT_HTML = 'text/html'
        APPLICATION_XHTML_XML = 'application/xhtml+xml'
1757+
1758+
16651759
##############################################################################
16661760
# Models
16671761
##############################################################################
16681762

16691763

1764+
class AnalyzedDocument():
    """
    An object containing the converted document and any identified enrichments.

    :attr List[Notice] notices: (optional) Array of notices returned alongside
          the result.
    :attr AnalyzedResult result: (optional) Result of the document analysis.
    """

    def __init__(self,
                 *,
                 notices: List['Notice'] = None,
                 result: 'AnalyzedResult' = None) -> None:
        """
        Initialize an AnalyzedDocument object.

        :param List[Notice] notices: (optional) Array of notices returned
               alongside the result.
        :param AnalyzedResult result: (optional) Result of the document
               analysis.
        """
        self.notices = notices
        self.result = result

    @classmethod
    def from_dict(cls, _dict: Dict) -> 'AnalyzedDocument':
        """
        Initialize an AnalyzedDocument object from a json dictionary.

        Keys whose value is `None` (JSON `null`) are treated the same as
        absent keys, so the corresponding attribute is left as `None`.

        :param Dict _dict: the deserialized JSON object.
        :return: the populated model instance.
        :rtype: AnalyzedDocument
        :raises ValueError: if `_dict` contains keys other than `notices` and
                `result`.
        """
        valid_keys = ('notices', 'result')
        bad_keys = set(_dict) - set(valid_keys)
        if bad_keys:
            # Sort so the message is stable between runs; set iteration
            # order is not.
            raise ValueError(
                'Unrecognized keys detected in dictionary for class AnalyzedDocument: '
                + ', '.join(sorted(map(str, bad_keys))))

        args = {}
        notices = _dict.get('notices')
        if notices is not None:
            args['notices'] = [Notice._from_dict(x) for x in notices]
        result = _dict.get('result')
        if result is not None:
            args['result'] = AnalyzedResult._from_dict(result)
        return cls(**args)

    @classmethod
    def _from_dict(cls, _dict):
        """Initialize an AnalyzedDocument object from a json dictionary."""
        return cls.from_dict(_dict)

    def to_dict(self) -> Dict:
        """
        Return a json dictionary representing this model.

        Attributes that are unset or `None` are omitted from the result.
        """
        _dict = {}
        if hasattr(self, 'notices') and self.notices is not None:
            _dict['notices'] = [x._to_dict() for x in self.notices]
        if hasattr(self, 'result') and self.result is not None:
            _dict['result'] = self.result._to_dict()
        return _dict

    def _to_dict(self):
        """Return a json dictionary representing this model."""
        return self.to_dict()

    def __str__(self) -> str:
        """Return a `str` version of this AnalyzedDocument object."""
        return json.dumps(self._to_dict(), indent=2)

    def __eq__(self, other: 'AnalyzedDocument') -> bool:
        """Return `true` when self and other are equal, false otherwise."""
        if not isinstance(other, self.__class__):
            return False
        return self.__dict__ == other.__dict__

    def __ne__(self, other: 'AnalyzedDocument') -> bool:
        """Return `true` when self and other are not equal, false otherwise."""
        return not self == other
1836+
1837+
1838+
class AnalyzedResult():
    """
    Result of the document analysis.

    Besides `metadata`, instances accept arbitrary additional properties; the
    names of those are tracked in `_additionalProperties` so they can be
    serialized again by `to_dict`.

    :attr dict metadata: (optional) Metadata of the document.
    """

    def __init__(self, *, metadata: dict = None, **kwargs) -> None:
        """
        Initialize an AnalyzedResult object.

        :param dict metadata: (optional) Metadata of the document.
        :param **kwargs: (optional) Any additional properties.
        """
        self.metadata = metadata
        for _key, _value in kwargs.items():
            setattr(self, _key, _value)

    @classmethod
    def from_dict(cls, _dict: Dict) -> 'AnalyzedResult':
        """
        Initialize an AnalyzedResult object from a json dictionary.

        The `metadata` key maps to the declared attribute; every other key is
        passed through as an additional property.
        """
        args = {}
        if 'metadata' in _dict:
            args['metadata'] = _dict.get('metadata')
        args.update({
            _key: _value for _key, _value in _dict.items()
            if _key != 'metadata'
        })
        return cls(**args)

    @classmethod
    def _from_dict(cls, _dict):
        """Initialize an AnalyzedResult object from a json dictionary."""
        return cls.from_dict(_dict)

    def to_dict(self) -> Dict:
        """
        Return a json dictionary representing this model.

        `metadata` comes first, followed by the additional properties in
        sorted name order. Properties whose value is `None` are omitted.
        """
        _dict = {}
        if hasattr(self, 'metadata') and self.metadata is not None:
            _dict['metadata'] = self.metadata
        if hasattr(self, '_additionalProperties'):
            # The names live in a set, whose iteration order changes between
            # interpreter runs; sorting keeps to_dict()/str() output stable.
            for _key in sorted(self._additionalProperties):
                _value = getattr(self, _key, None)
                if _value is not None:
                    _dict[_key] = _value
        return _dict

    def _to_dict(self):
        """Return a json dictionary representing this model."""
        return self.to_dict()

    def __setattr__(self, name: str, value: object) -> None:
        """
        Set an attribute, recording its name if it is not a declared property.

        The tracking set is created lazily on first assignment, via the base
        class `__setattr__` so that creating it does not recurse into this
        method.
        """
        properties = {'metadata'}
        if not hasattr(self, '_additionalProperties'):
            super().__setattr__('_additionalProperties', set())
        if name not in properties:
            self._additionalProperties.add(name)
        super().__setattr__(name, value)

    def __str__(self) -> str:
        """Return a `str` version of this AnalyzedResult object."""
        return json.dumps(self._to_dict(), indent=2)

    def __eq__(self, other: 'AnalyzedResult') -> bool:
        """Return `true` when self and other are equal, false otherwise."""
        if not isinstance(other, self.__class__):
            return False
        return self.__dict__ == other.__dict__

    def __ne__(self, other: 'AnalyzedResult') -> bool:
        """Return `true` when self and other are not equal, false otherwise."""
        return not self == other
1910+
1911+
16701912
class Collection():
16711913
"""
16721914
A collection for storing documents.

test/integration/test_discovery_v2.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# coding: utf-8
22
from unittest import TestCase
3-
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
3+
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator, BearerTokenAuthenticator
44
from ibm_watson.discovery_v2 import CreateEnrichment, EnrichmentOptions
5+
from os.path import abspath
56
import os
67
import ibm_watson
78
import pytest
@@ -108,3 +109,23 @@ def test_enrichments(self):
108109
self.project_id,
109110
enrichment_id
110111
)
112+
113+
# can only test in CPD
114+
def test_analyze(self):
    """
    Smoke-test `analyze_document` against a Cloud Pak for Data instance.

    The bearer token, service URL, project ID and collection ID below are
    placeholders and must be replaced with real values before running.
    """
    cpd_client = ibm_watson.DiscoveryV2(
        version='2020-08-12',
        authenticator=BearerTokenAuthenticator('<bearer_token>'))
    cpd_client.service_url = "<url>"
    cpd_client.set_disable_ssl_verification(True)

    # Path is resolved relative to the current working directory.
    document_path = abspath('resources/problem.json')
    with open(document_path, 'rb') as document:
        response = cpd_client.analyze_document(
            project_id="<project_id>",
            collection_id="<collection_id>",
            file=document,
            file_content_type="application/json")
        result = response.get_result()

    assert result is not None
131+

0 commit comments

Comments
 (0)