Skip to content

Commit 8f58c47

Browse files
authored
feat: Keep file extension on deduplicated files (#6463)
fix #6461
1 parent 8dcd8aa commit 8f58c47

File tree

7 files changed

+78
-56
lines changed

7 files changed

+78
-56
lines changed

deltachat-ffi/deltachat.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4766,7 +4766,7 @@ void dc_msg_set_file (dc_msg_t* msg, const char* file,
47664766
* otherwise it will be copied to the blobdir first.
47674767
*
47684768
* In order to deduplicate files that contain the same data,
4769-
* the file will be named as a hash of the file data.
4769+
* the file will be named `<hash>.<extension>`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`.
47704770
*
47714771
* NOTE:
47724772
* - This function will rename the file. To get the new file path, call `get_file()`.

src/blob.rs

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,16 @@ impl<'a> BlobObject<'a> {
150150
/// otherwise it will be copied to the blobdir first.
151151
///
152152
/// In order to deduplicate files that contain the same data,
153-
/// the file will be named as the hash of the file data.
153+
/// the file will be named `<hash>.<extension>`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`.
154+
/// The `original_name` param is only used to get the extension.
154155
///
155156
/// This is done in a in way which avoids race-conditions when multiple files are
156157
/// concurrently created.
157-
pub fn create_and_deduplicate(context: &'a Context, src: &Path) -> Result<BlobObject<'a>> {
158+
pub fn create_and_deduplicate(
159+
context: &'a Context,
160+
src: &Path,
161+
original_name: &str,
162+
) -> Result<BlobObject<'a>> {
158163
// `create_and_deduplicate{_from_bytes}()` do blocking I/O, but can still be called
159164
// from an async context thanks to `block_in_place()`.
160165
// Tokio's "async" I/O functions are also just thin wrappers around the blocking I/O syscalls,
@@ -180,7 +185,25 @@ impl<'a> BlobObject<'a> {
180185
src_in_blobdir = &temp_path;
181186
}
182187

183-
let blob = BlobObject::from_hash(blobdir, file_hash(src_in_blobdir)?);
188+
let hash = file_hash(src_in_blobdir)?.to_hex();
189+
let hash = hash.as_str();
190+
let hash = hash.get(0..31).unwrap_or(hash);
191+
let new_file = if let Some(extension) = Path::new(original_name)
192+
.extension()
193+
.filter(|e| e.len() <= 32)
194+
{
195+
format!(
196+
"$BLOBDIR/{hash}.{}",
197+
extension.to_string_lossy().to_lowercase()
198+
)
199+
} else {
200+
format!("$BLOBDIR/{hash}")
201+
};
202+
203+
let blob = BlobObject {
204+
blobdir,
205+
name: new_file,
206+
};
184207
let new_path = blob.to_abs_path();
185208

186209
// This will also replace an already-existing file.
@@ -194,7 +217,8 @@ impl<'a> BlobObject<'a> {
194217

195218
/// Creates a new blob object with the file contents in `data`.
196219
/// In order to deduplicate files that contain the same data,
197-
/// the file will be renamed to a hash of the file data.
220+
/// the file will be named `<hash>.<extension>`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`.
221+
/// The `original_name` param is only used to get the extension.
198222
///
199223
/// The `data` will be written into the file without race-conditions.
200224
///
@@ -203,6 +227,7 @@ impl<'a> BlobObject<'a> {
203227
pub fn create_and_deduplicate_from_bytes(
204228
context: &'a Context,
205229
data: &[u8],
230+
original_name: &str,
206231
) -> Result<BlobObject<'a>> {
207232
task::block_in_place(|| {
208233
let blobdir = context.get_blobdir();
@@ -213,20 +238,10 @@ impl<'a> BlobObject<'a> {
213238
std::fs::write(&temp_path, data).context("writing new blobfile failed")?;
214239
};
215240

216-
BlobObject::create_and_deduplicate(context, &temp_path)
241+
BlobObject::create_and_deduplicate(context, &temp_path, original_name)
217242
})
218243
}
219244

220-
fn from_hash(blobdir: &Path, hash: blake3::Hash) -> BlobObject<'_> {
221-
let hash = hash.to_hex();
222-
let hash = hash.as_str();
223-
let hash = hash.get(0..31).unwrap_or(hash);
224-
BlobObject {
225-
blobdir,
226-
name: format!("$BLOBDIR/{hash}"),
227-
}
228-
}
229-
230245
/// Creates a blob from a file, possibly copying it to the blobdir.
231246
///
232247
/// If the source file is not a path to into the blob directory
@@ -674,7 +689,7 @@ impl<'a> BlobObject<'a> {
674689
encode_img(&img, ofmt, &mut encoded)?;
675690
}
676691

677-
self.name = BlobObject::create_and_deduplicate_from_bytes(context, &encoded)
692+
self.name = BlobObject::create_and_deduplicate_from_bytes(context, &encoded, &name)
678693
.context("failed to write recoded blob to file")?
679694
.name;
680695
}
@@ -905,8 +920,11 @@ mod tests {
905920
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
906921
async fn test_lowercase_ext() {
907922
let t = TestContext::new().await;
908-
let blob = BlobObject::create(&t, "foo.TXT", b"hello").await.unwrap();
909-
assert_eq!(blob.as_name(), "$BLOBDIR/foo.txt");
923+
let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"hello", "foo.TXT").unwrap();
924+
assert!(
925+
blob.as_name().ends_with(".txt"),
926+
"Blob {blob:?} should end with .txt"
927+
);
910928
}
911929

912930
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
@@ -980,10 +998,10 @@ mod tests {
980998
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
981999
async fn test_create_long_names() {
9821000
let t = TestContext::new().await;
983-
let s = "1".repeat(150);
984-
let blob = BlobObject::create(&t, &s, b"data").await.unwrap();
1001+
let s = format!("file.{}", "a".repeat(100));
1002+
let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"data", &s).unwrap();
9851003
let blobname = blob.as_name().split('/').last().unwrap();
986-
assert!(blobname.len() < 128);
1004+
assert!(blobname.len() < 70);
9871005
}
9881006

9891007
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
@@ -1618,28 +1636,28 @@ mod tests {
16181636

16191637
let path = t.get_blobdir().join("anyfile.dat");
16201638
fs::write(&path, b"bla").await?;
1621-
let blob = BlobObject::create_and_deduplicate(&t, &path)?;
1622-
assert_eq!(blob.name, "$BLOBDIR/ce940175885d7b78f7b7e9f1396611f");
1639+
let blob = BlobObject::create_and_deduplicate(&t, &path, "anyfile.dat")?;
1640+
assert_eq!(blob.name, "$BLOBDIR/ce940175885d7b78f7b7e9f1396611f.dat");
16231641
assert_eq!(path.exists(), false);
16241642

16251643
assert_eq!(fs::read(&blob.to_abs_path()).await?, b"bla");
16261644

16271645
fs::write(&path, b"bla").await?;
1628-
let blob2 = BlobObject::create_and_deduplicate(&t, &path)?;
1646+
let blob2 = BlobObject::create_and_deduplicate(&t, &path, "anyfile.dat")?;
16291647
assert_eq!(blob2.name, blob.name);
16301648

16311649
let path_outside_blobdir = t.dir.path().join("anyfile.dat");
16321650
fs::write(&path_outside_blobdir, b"bla").await?;
1633-
let blob3 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir)?;
1651+
let blob3 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir, "anyfile.dat")?;
16341652
assert!(path_outside_blobdir.exists());
16351653
assert_eq!(blob3.name, blob.name);
16361654

16371655
fs::write(&path, b"blabla").await?;
1638-
let blob4 = BlobObject::create_and_deduplicate(&t, &path)?;
1656+
let blob4 = BlobObject::create_and_deduplicate(&t, &path, "anyfile.dat")?;
16391657
assert_ne!(blob4.name, blob.name);
16401658

16411659
fs::remove_dir_all(t.get_blobdir()).await?;
1642-
let blob5 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir)?;
1660+
let blob5 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir, "anyfile.dat")?;
16431661
assert_eq!(blob5.name, blob.name);
16441662

16451663
Ok(())
@@ -1650,7 +1668,7 @@ mod tests {
16501668
let t = TestContext::new().await;
16511669

16521670
fs::remove_dir(t.get_blobdir()).await?;
1653-
let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla")?;
1671+
let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla", "file")?;
16541672
assert_eq!(blob.name, "$BLOBDIR/ce940175885d7b78f7b7e9f1396611f");
16551673

16561674
assert_eq!(fs::read(&blob.to_abs_path()).await?, b"bla");
@@ -1662,7 +1680,7 @@ mod tests {
16621680
// which we can't mock from our code.
16631681
tokio::time::sleep(Duration::from_millis(1100)).await;
16641682

1665-
let blob2 = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla")?;
1683+
let blob2 = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla", "file")?;
16661684
assert_eq!(blob2.name, blob.name);
16671685

16681686
let modified2 = blob.to_abs_path().metadata()?.modified()?;
@@ -1675,15 +1693,15 @@ mod tests {
16751693
sql::housekeeping(&t).await?;
16761694
assert_eq!(blob2.to_abs_path().exists(), false);
16771695

1678-
let blob3 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla")?;
1696+
let blob3 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla", "file")?;
16791697
assert_ne!(blob3.name, blob.name);
16801698

16811699
{
16821700
// If something goes wrong and the blob file is overwritten,
16831701
// the correct content should be restored:
16841702
fs::write(blob3.to_abs_path(), b"bloblo").await?;
16851703

1686-
let blob4 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla")?;
1704+
let blob4 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla", "file")?;
16871705
let blob4_content = fs::read(blob4.to_abs_path()).await?;
16881706
assert_eq!(blob4_content, b"blabla");
16891707
}

src/chat/chat_tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2832,7 +2832,7 @@ async fn test_blob_renaming() -> Result<()> {
28322832

28332833
// the file bob receives should not contain BIDI-control characters
28342834
assert_eq!(
2835-
Some("$BLOBDIR/30c0f9c6a167fc2a91285c85be7ea34"),
2835+
Some("$BLOBDIR/30c0f9c6a167fc2a91285c85be7ea34.exe"),
28362836
msg.param.get(Param::File),
28372837
);
28382838
assert_eq!(

src/imex/transfer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ mod tests {
431431
let path = msg.get_file(&ctx1).unwrap();
432432
assert_eq!(
433433
// That's the hash of the file:
434-
path.with_file_name("ac1d2d284757656a8d41dc40aae4136"),
434+
path.with_file_name("ac1d2d284757656a8d41dc40aae4136.txt"),
435435
path
436436
);
437437
assert_eq!("hello.txt", msg.get_filename().unwrap());

src/message.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
//! # Messages and their identifiers.
22
33
use std::collections::BTreeSet;
4-
use std::ffi::OsStr;
54
use std::path::{Path, PathBuf};
65
use std::str;
76

@@ -1095,7 +1094,7 @@ impl Message {
10951094
/// otherwise it will be copied to the blobdir first.
10961095
///
10971096
/// In order to deduplicate files that contain the same data,
1098-
/// the file will be named as a hash of the file data.
1097+
/// the file will be named `<hash>.<extension>`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`.
10991098
///
11001099
/// NOTE:
11011100
/// - This function will rename the file. To get the new file path, call `get_file()`.
@@ -1107,14 +1106,18 @@ impl Message {
11071106
name: Option<&str>,
11081107
filemime: Option<&str>,
11091108
) -> Result<()> {
1110-
let blob = BlobObject::create_and_deduplicate(context, file)?;
1111-
if let Some(name) = name {
1112-
self.param.set(Param::Filename, name);
1109+
let name = if let Some(name) = name {
1110+
name.to_string()
11131111
} else {
1114-
let file_name = file.file_name().map(OsStr::to_string_lossy);
1115-
self.param.set_optional(Param::Filename, file_name);
1116-
}
1112+
file.file_name()
1113+
.map(|s| s.to_string_lossy().to_string())
1114+
.unwrap_or_else(|| "unknown_file".to_string())
1115+
};
1116+
1117+
let blob = BlobObject::create_and_deduplicate(context, file, &name)?;
11171118
self.param.set(Param::File, blob.as_name());
1119+
1120+
self.param.set(Param::Filename, name);
11181121
self.param.set_optional(Param::MimeType, filemime);
11191122

11201123
Ok(())
@@ -1123,7 +1126,7 @@ impl Message {
11231126
/// Creates a new blob and sets it as a file associated with a message.
11241127
///
11251128
/// In order to deduplicate files that contain the same data,
1126-
/// the filename will be a hash of the file data.
1129+
/// the file will be named `<hash>.<extension>`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`.
11271130
///
11281131
/// NOTE: The file must not be modified after this function was called.
11291132
pub fn set_file_from_bytes(
@@ -1133,7 +1136,7 @@ impl Message {
11331136
data: &[u8],
11341137
filemime: Option<&str>,
11351138
) -> Result<()> {
1136-
let blob = BlobObject::create_and_deduplicate_from_bytes(context, data)?;
1139+
let blob = BlobObject::create_and_deduplicate_from_bytes(context, data, name)?;
11371140
self.param.set(Param::Filename, name);
11381141
self.param.set(Param::File, blob.as_name());
11391142
self.param.set_optional(Param::MimeType, filemime);

src/mimeparser.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1384,16 +1384,17 @@ impl MimeMessage {
13841384
/* we have a regular file attachment,
13851385
write decoded data to new blob object */
13861386

1387-
let blob = match BlobObject::create_and_deduplicate_from_bytes(context, decoded_data) {
1388-
Ok(blob) => blob,
1389-
Err(err) => {
1390-
error!(
1391-
context,
1392-
"Could not add blob for mime part {}, error {:#}", filename, err
1393-
);
1394-
return Ok(());
1395-
}
1396-
};
1387+
let blob =
1388+
match BlobObject::create_and_deduplicate_from_bytes(context, decoded_data, filename) {
1389+
Ok(blob) => blob,
1390+
Err(err) => {
1391+
error!(
1392+
context,
1393+
"Could not add blob for mime part {}, error {:#}", filename, err
1394+
);
1395+
return Ok(());
1396+
}
1397+
};
13971398
info!(context, "added blobfile: {:?}", blob.as_name());
13981399

13991400
if mime_type.type_() == mime::IMAGE {

src/receive_imf/receive_imf_tests.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1667,7 +1667,7 @@ async fn test_pdf_filename_simple() {
16671667
assert_eq!(
16681668
file_path,
16691669
// That's the blake3 hash of the file content:
1670-
"$BLOBDIR/24a6af459cec5d733374aeaa19a6133"
1670+
"$BLOBDIR/24a6af459cec5d733374aeaa19a6133.pdf"
16711671
);
16721672
assert_eq!(msg.param.get(Param::Filename).unwrap(), "simple.pdf");
16731673
}
@@ -3268,7 +3268,7 @@ async fn test_weird_and_duplicated_filenames() -> Result<()> {
32683268
msg.save_file(t, &path2).await.unwrap();
32693269
assert_eq!(
32703270
path.file_name().unwrap().to_str().unwrap(),
3271-
"79402cb76f44c5761888f9036992a76",
3271+
"79402cb76f44c5761888f9036992a76.gz",
32723272
"The hash of the content should always be the same"
32733273
);
32743274
assert_eq!(fs::read_to_string(&path).await.unwrap(), content);

0 commit comments

Comments
 (0)