Skip to content

Commit b161be4

Browse files
First draft to fix estimation of file extension.
1 parent 74c977e commit b161be4

File tree

2 files changed

+177
-162
lines changed

2 files changed

+177
-162
lines changed

build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ dependencies {
5454
implementation 'commons-io:commons-io:2.11.0'
5555
implementation 'javax.validation:validation-api:2.0.1.Final'
5656
implementation 'edu.kit.datamanager:service-base:1.0.4'
57+
// apache
58+
implementation "org.apache.tika:tika-core:2.7.0"
5759

5860
testImplementation platform('org.junit:junit-bom:5.9.0')
5961
testImplementation 'org.junit.jupiter:junit-jupiter:5.9.0'

src/main/java/edu/kit/datamanager/mappingservice/util/FileUtil.java

Lines changed: 175 additions & 162 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
1716
package edu.kit.datamanager.mappingservice.util;
1817

1918
import edu.kit.datamanager.clients.SimpleServiceClient;
@@ -37,6 +36,10 @@
3736
import java.util.Optional;
3837
import java.util.regex.Matcher;
3938
import java.util.regex.Pattern;
39+
import org.apache.tika.Tika;
40+
import org.apache.tika.mime.MimeType;
41+
import org.apache.tika.mime.MimeTypeException;
42+
import org.apache.tika.mime.MimeTypes;
4043

4144
/**
4245
* Various utility methods for file handling.
@@ -46,174 +49,184 @@
4649
*/
4750
public class FileUtil {
4851

49-
/**
50-
* Default value for suffix of temporary files.
51-
*/
52-
public static final String DEFAULT_SUFFIX = ".tmp";
53-
/**
54-
* Default value for prefix of temporary files.
55-
*/
56-
public static final String DEFAULT_PREFIX = "MappingUtil_";
57-
/**
58-
* Logger for this class.
59-
*/
60-
private static final Logger LOGGER = LoggerFactory.getLogger(FileUtil.class);
61-
62-
private static final int MAX_LENGTH_OF_HEADER = 100;
63-
64-
private static final Pattern JSON_FIRST_BYTE = Pattern.compile("(\\R\\s)*\\s*\\{\\s*\"(.|\\s)*", Pattern.MULTILINE);//^\\s{\\s*\".*");
65-
private static final Pattern XML_FIRST_BYTE = Pattern.compile("((.|\\s)*<\\?xml[^<]*)?\\s*<\\s*(\\w+:)?\\w+(.|\\s)*", Pattern.MULTILINE);
66-
67-
/**
68-
* Downloads or copy the file behind the given URI and returns its path on
69-
* local disc. You should delete or move to another location afterwards.
70-
*
71-
* @param resourceURL the given URI
72-
* @return the path to the created file.
73-
*/
74-
public static Optional<Path> downloadResource(URI resourceURL) {
75-
String content;
76-
Path downloadedFile = null;
77-
try {
78-
if (resourceURL != null) {
79-
String suffix = FilenameUtils.getExtension(resourceURL.getPath());
80-
suffix = suffix.trim().isEmpty() ? DEFAULT_SUFFIX : "." + suffix;
81-
if (resourceURL.getHost() != null) {
82-
content = SimpleServiceClient
83-
.create(resourceURL.toString())
84-
.accept(MediaType.TEXT_PLAIN)
85-
.getResource(String.class);
86-
downloadedFile = createTempFile("download", suffix);
87-
FileUtils.writeStringToFile(downloadedFile.toFile(), content, StandardCharsets.UTF_8);
88-
} else {
89-
// copy local file to new place.
90-
File srcFile = new File(resourceURL.getPath());
91-
File destFile = FileUtil.createTempFile("local", suffix).toFile();
92-
FileUtils.copyFile(srcFile, destFile);
93-
downloadedFile = destFile.toPath();
94-
}
95-
}
96-
} catch (Throwable tw) {
97-
LOGGER.error("Error reading URI '" + resourceURL + "'", tw);
98-
throw new MappingException("Error downloading resource from '" + resourceURL + "'!", tw);
52+
/**
53+
* Default value for suffix of temporary files.
54+
*/
55+
public static final String DEFAULT_SUFFIX = ".tmp";
56+
/**
57+
* Default value for prefix of temporary files.
58+
*/
59+
public static final String DEFAULT_PREFIX = "MappingUtil_";
60+
/**
61+
* Logger for this class.
62+
*/
63+
private static final Logger LOGGER = LoggerFactory.getLogger(FileUtil.class);
64+
65+
private static final int MAX_LENGTH_OF_HEADER = 100;
66+
67+
private static final Pattern JSON_FIRST_BYTE = Pattern.compile("(\\R\\s)*\\s*\\{\\s*\"(.|\\s)*", Pattern.MULTILINE);//^\\s{\\s*\".*");
68+
private static final Pattern XML_FIRST_BYTE = Pattern.compile("((.|\\s)*<\\?xml[^<]*)?\\s*<\\s*(\\w+:)?\\w+(.|\\s)*", Pattern.MULTILINE);
69+
70+
/**
71+
* Downloads or copies the file behind the given URI and returns its path on
72+
* the local disk. You should delete it or move it to another location afterwards.
73+
*
74+
* @param resourceURL the given URI
75+
* @return the path to the created file.
76+
*/
77+
public static Optional<Path> downloadResource(URI resourceURL) {
78+
String content;
79+
Path downloadedFile = null;
80+
try {
81+
if (resourceURL != null) {
82+
String suffix = FilenameUtils.getExtension(resourceURL.getPath());
83+
suffix = suffix.trim().isEmpty() ? DEFAULT_SUFFIX : "." + suffix;
84+
if (resourceURL.getHost() != null) {
85+
content = SimpleServiceClient
86+
.create(resourceURL.toString())
87+
.accept(MediaType.TEXT_PLAIN)
88+
.getResource(String.class);
89+
downloadedFile = createTempFile("download", suffix);
90+
FileUtils.writeStringToFile(downloadedFile.toFile(), content, StandardCharsets.UTF_8);
91+
} else {
92+
// copy local file to new place.
93+
File srcFile = new File(resourceURL.getPath());
94+
File destFile = FileUtil.createTempFile("local", suffix).toFile();
95+
FileUtils.copyFile(srcFile, destFile);
96+
downloadedFile = destFile.toPath();
9997
}
100-
downloadedFile = fixFileExtension(downloadedFile);
101-
102-
return Optional.ofNullable(downloadedFile);
98+
}
99+
} catch (Throwable tw) {
100+
LOGGER.error("Error reading URI '" + resourceURL + "'", tw);
101+
throw new MappingException("Error downloading resource from '" + resourceURL + "'!", tw);
103102
}
104-
105-
/**
106-
* Fix extension of file if possible.
107-
*
108-
* @param pathToFile the given URI
109-
* @return the path to the (renamed) file.
110-
*/
111-
public static Path fixFileExtension(Path pathToFile) {
112-
Path returnFile = pathToFile;
113-
Path renamedFile = pathToFile;
114-
try {
115-
if ((pathToFile != null) && (pathToFile.toFile().exists())) {
116-
String contentOfFile = FileUtils.readFileToString(pathToFile.toFile(), StandardCharsets.UTF_8);
117-
String newExtension = guessFileExtension(contentOfFile.getBytes());
118-
if (newExtension != null) {
119-
if (!pathToFile.toString().endsWith(newExtension)) {
120-
renamedFile = Paths.get(pathToFile + newExtension);
121-
FileUtils.moveFile(pathToFile.toFile(), renamedFile.toFile());
122-
returnFile = renamedFile;
123-
}
124-
}
125-
}
126-
} catch (IOException ex) {
127-
LOGGER.error("Error moving file '{}' to '{}'.", pathToFile, renamedFile);
103+
downloadedFile = fixFileExtension(downloadedFile);
104+
105+
return Optional.ofNullable(downloadedFile);
106+
}
107+
108+
/**
109+
* Fix extension of file if possible.
110+
*
111+
* @param pathToFile the path to the file whose extension should be fixed
112+
* @return the path to the (renamed) file.
113+
*/
114+
public static Path fixFileExtension(Path pathToFile) {
115+
Path returnFile = pathToFile;
116+
Path renamedFile = pathToFile;
117+
try {
118+
if ((pathToFile != null) && (pathToFile.toFile().exists())) {
119+
Tika tika = new Tika();
120+
String mimeType = tika.detect(pathToFile.toFile());
121+
MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
122+
MimeType estimatedMimeType = allTypes.forName(mimeType);
123+
String newExtension = estimatedMimeType.getExtension(); // .jpg
124+
125+
if (newExtension != null) {
126+
if (!pathToFile.toString().endsWith(newExtension)) {
127+
renamedFile = Paths.get(pathToFile + newExtension);
128+
FileUtils.moveFile(pathToFile.toFile(), renamedFile.toFile());
129+
returnFile = renamedFile;
130+
}
128131
}
129-
return returnFile;
132+
}
133+
} catch (IOException|MimeTypeException ex) {
134+
LOGGER.error("Error moving file '{}' to '{}'.", pathToFile, renamedFile);
130135
}
131-
132-
/**
133-
* Create temporary file. Attention: The file will not be removed
134-
* automatically.
135-
*
136-
* @param prefix prefix of the file
137-
* @param suffix suffix of the file
138-
* @return Path to file
139-
* @throws MappingException if an error occurs
140-
*/
141-
public static Path createTempFile(String prefix, String suffix) {
142-
Path tempFile;
143-
prefix = (prefix == null || prefix.trim().isEmpty()) ? DEFAULT_PREFIX : prefix;
144-
suffix = (suffix == null || suffix.trim().isEmpty() || suffix.trim().equals(".")) ? DEFAULT_SUFFIX : suffix;
145-
try {
146-
tempFile = Files.createTempFile(prefix, suffix);
147-
} catch (IOException ioe) {
148-
throw new MappingException("Error creating tmp file!", ioe);
149-
}
150-
return tempFile;
136+
return returnFile;
137+
}
138+
139+
/**
140+
* Create temporary file. Attention: The file will not be removed
141+
* automatically.
142+
*
143+
* @param prefix prefix of the file
144+
* @param suffix suffix of the file
145+
* @return Path to file
146+
* @throws MappingException if an error occurs
147+
*/
148+
public static Path createTempFile(String prefix, String suffix) {
149+
Path tempFile;
150+
prefix = (prefix == null || prefix.trim().isEmpty()) ? DEFAULT_PREFIX : prefix;
151+
suffix = (suffix == null || suffix.trim().isEmpty() || suffix.trim().equals(".")) ? DEFAULT_SUFFIX : suffix;
152+
try {
153+
tempFile = Files.createTempFile(prefix, suffix);
154+
} catch (IOException ioe) {
155+
throw new MappingException("Error creating tmp file!", ioe);
151156
}
152-
153-
/**
154-
* Remove temporary file.
155-
*
156-
* @param tempFile Path to file
157-
*/
158-
public static void removeFile(Path tempFile) {
159-
try {
160-
Files.deleteIfExists(tempFile);
161-
} catch (IOException ioe) {
162-
throw new MappingException("Error removing file '" + tempFile + "'!", ioe);
163-
}
157+
return tempFile;
158+
}
159+
160+
/**
161+
* Remove temporary file.
162+
*
163+
* @param tempFile Path to file
164+
*/
165+
public static void removeFile(Path tempFile) {
166+
try {
167+
Files.deleteIfExists(tempFile);
168+
} catch (IOException ioe) {
169+
throw new MappingException("Error removing file '" + tempFile + "'!", ioe);
164170
}
165-
166-
private static String guessFileExtension(byte[] schema) {
167-
// Cut schema to a maximum of MAX_LENGTH_OF_HEADER characters.
168-
int length = Math.min(schema.length, MAX_LENGTH_OF_HEADER);
169-
String schemaAsString = new String(schema, 0, length);
170-
LOGGER.trace("Guess type for '{}'", schemaAsString);
171-
172-
Matcher m = JSON_FIRST_BYTE.matcher(schemaAsString);
173-
if (m.matches()) {
174-
return ".json";
175-
} else {
176-
m = XML_FIRST_BYTE.matcher(schemaAsString);
177-
if (m.matches()) {
178-
return ".xml";
179-
}
180-
}
181-
return null;
171+
}
172+
173+
/**
174+
* Guess the extension of the file from its first bytes using the JSON and XML header patterns.
175+
*
176+
* @param schema First bytes of the file.
177+
* @return Estimated extension (e.g. '.xml'), or null if the content matches neither JSON nor XML.
178+
*/
179+
private static String guessFileExtension(byte[] schema) {
180+
// Cut schema to a maximum of MAX_LENGTH_OF_HEADER characters.
181+
int length = Math.min(schema.length, MAX_LENGTH_OF_HEADER);
182+
String schemaAsString = new String(schema, 0, length);
183+
LOGGER.trace("Guess type for '{}'", schemaAsString);
184+
185+
Matcher m = JSON_FIRST_BYTE.matcher(schemaAsString);
186+
if (m.matches()) {
187+
return ".json";
188+
} else {
189+
m = XML_FIRST_BYTE.matcher(schemaAsString);
190+
if (m.matches()) {
191+
return ".xml";
192+
}
182193
}
183-
184-
/**
185-
* This method clones a git repository into the 'lib' folder.
186-
*
187-
* @param repositoryUrl the url of the repository to clone
188-
* @param branch the branch to clone
189-
* @return the path to the cloned repository
190-
*/
191-
public static Path cloneGitRepository(String repositoryUrl, String branch) {
192-
String target = "lib/" + repositoryUrl.trim().replace("https://", "").replace("http://", "").replace(".git", "") + "_" + branch;
193-
return cloneGitRepository(repositoryUrl, branch, target);
194+
return null;
195+
}
196+
197+
/**
198+
* This method clones a git repository into the 'lib' folder.
199+
*
200+
* @param repositoryUrl the url of the repository to clone
201+
* @param branch the branch to clone
202+
* @return the path to the cloned repository
203+
*/
204+
public static Path cloneGitRepository(String repositoryUrl, String branch) {
205+
String target = "lib/" + repositoryUrl.trim().replace("https://", "").replace("http://", "").replace(".git", "") + "_" + branch;
206+
return cloneGitRepository(repositoryUrl, branch, target);
207+
}
208+
209+
/**
210+
* This method clones a git repository into the given target folder.
211+
*
212+
* @param repositoryUrl the url of the repository to clone
213+
* @param branch the branch to clone
214+
* @param targetFolder the target folder
215+
* @return the path to the cloned repository
216+
*/
217+
public static Path cloneGitRepository(String repositoryUrl, String branch, String targetFolder) {
218+
File target = new File(targetFolder);
219+
target.mkdirs();
220+
221+
LOGGER.info("Cloning branch '{}' of repository '{}' to '{}'", branch, repositoryUrl, target.getPath());
222+
try {
223+
Git.cloneRepository().setURI(repositoryUrl).setBranch(branch).setDirectory(target).call();
224+
} catch (JGitInternalException e) {
225+
LOGGER.info(e.getMessage());
226+
} catch (GitAPIException ex) {
227+
throw new MappingException("Error cloning git repository '" + repositoryUrl + "' to '" + target + "'!", ex);
194228
}
195229

196-
/**
197-
* This method clones a git repository into the 'lib' folder.
198-
*
199-
* @param repositoryUrl the url of the repository to clone
200-
* @param branch the branch to clone
201-
* @param targetFolder the target folder
202-
* @return the path to the cloned repository
203-
*/
204-
public static Path cloneGitRepository(String repositoryUrl, String branch, String targetFolder) {
205-
File target = new File(targetFolder);
206-
target.mkdirs();
207-
208-
LOGGER.info("Cloning branch '{}' of repository '{}' to '{}'", branch, repositoryUrl, target.getPath());
209-
try {
210-
Git.cloneRepository().setURI(repositoryUrl).setBranch(branch).setDirectory(target).call();
211-
} catch (JGitInternalException e) {
212-
LOGGER.info(e.getMessage());
213-
} catch (GitAPIException ex) {
214-
throw new MappingException("Error cloning git repository '" + repositoryUrl + "' to '" + target + "'!", ex);
215-
}
216-
217-
return target.toPath();
218-
}
230+
return target.toPath();
231+
}
219232
}

0 commit comments

Comments
 (0)