commit 62bc388c6cfcf5910c2683a487e175a19ac24ca9 Author: Mickaƫl Schoentgen Date: Thu Feb 18 14:56:00 2021 +0100 NXDRIVE-2527: Use files size for large files local changes detection From now on, "large" files changes detection will be based on their size and not on their checksum. diff --git a/docs/changes/5.0.0.md b/docs/changes/5.0.0.md index 67168171..64f591e3 100644 --- a/docs/changes/5.0.0.md +++ b/docs/changes/5.0.0.md @@ -16,6 +16,7 @@ Release date: `2021-xx-xx` - [NXDRIVE-2518](https://jira.nuxeo.com/browse/NXDRIVE-2518): Drop support for macOS 10.12 (Sierra) - [NXDRIVE-2524](https://jira.nuxeo.com/browse/NXDRIVE-2524): [GNU/Linux] WAL journal mode cause manager database corruption - [NXDRIVE-2525](https://jira.nuxeo.com/browse/NXDRIVE-2525): Do not try to list parts when an upload was already completed on S3 +- [NXDRIVE-2527](https://jira.nuxeo.com/browse/NXDRIVE-2527): Use files size for large files local changes detection ### Direct Edit @@ -84,4 +85,5 @@ Release date: `2021-xx-xx` - Removed logging_config.py::`update_logger_file()` - Added utils.py::`client_certificate() - Added utils.py::`digest_status()` +- Added utils.py::`is_large_file()` - Changed the return type of utils.py::`test_url()` from `bool` to `str` diff --git a/nxdrive/engine/dao/sqlite.py b/nxdrive/engine/dao/sqlite.py index 9070636d..b1e51c6b 100644 --- a/nxdrive/engine/dao/sqlite.py +++ b/nxdrive/engine/dao/sqlite.py @@ -56,9 +56,8 @@ Session, Upload, ) -from ...options import Options from ...qt.imports import QObject, pyqtSignal -from ...utils import current_thread_id +from ...utils import current_thread_id, is_large_file from .utils import fix_db, restore_backup, save_backup if TYPE_CHECKING: @@ -1202,7 +1201,7 @@ def insert_local_state( ) -> int: digest = None if not info.folderish: - if info.size >= Options.big_file * 1024 * 1024: + if is_large_file(info.size): # We can't compute the digest of big files now as it will # be done later when the entire file is fully copied. # For instance, on my machine (32GB RAM, 8 cores, Intel NUC) diff --git a/nxdrive/engine/watcher/local_watcher.py b/nxdrive/engine/watcher/local_watcher.py index 91c58d77..1c9ded7b 100644 --- a/nxdrive/engine/watcher/local_watcher.py +++ b/nxdrive/engine/watcher/local_watcher.py @@ -19,7 +19,12 @@ from ...objects import DocPair, Metrics from ...options import Options from ...qt.imports import pyqtSignal -from ...utils import current_milli_time, force_decode, is_generated_tmp_file +from ...utils import ( + current_milli_time, + force_decode, + is_generated_tmp_file, + is_large_file, +) from ...utils import normalize_event_filename as normalize from ..activity import tooltip from ..workers import EngineWorker, Worker @@ -52,6 +57,22 @@ def is_text_edit_tmp_file(name: str, /) -> bool: return bool(re.match(TEXT_EDIT_TMP_FILE_PATTERN, name)) +def _file_changed(local_file: FileInfo, doc_pair: DocPair) -> bool: + """Check if a file has changed. If yes, update attribute to reflect the change.""" + if is_large_file(local_file.size): + # For large files, we check the file size because computing their digest would be too expensive + digest = UNACCESSIBLE_HASH + changed = doc_pair.size != local_file.size + else: + # For smaller files, we can check their digest + digest = local_file.get_digest() + changed = doc_pair.local_digest != digest + + if changed: + doc_pair.local_digest = digest + return changed + + class LocalWatcher(EngineWorker): localScanFinished = pyqtSignal() rootMoved = pyqtSignal(Path) @@ -476,21 +497,21 @@ def _scan_recursive(self, info: FileInfo, /, *, recursive: bool = True) -> None: ) if old_pair is not None: old_pair.local_state = "moved" - # Check digest also - digest = child_info.get_digest() - if old_pair.local_digest != digest: - old_pair.local_digest = digest + if not child_info.folderish: + # The digest or size will be updated in old_pair if it changed + _file_changed(child_info, old_pair) dao.update_local_state( old_pair, client.get_info(doc_pair.local_path) ) self._protected_files[old_pair.remote_ref] = True + doc_pair.local_state = "moved" - # Check digest also - digest = child_info.get_digest() - if doc_pair.local_digest != digest: - doc_pair.local_digest = digest + if not child_info.folderish: + # The digest or size will be updated in old_pair if it changed + _file_changed(child_info, old_pair) dao.update_local_state(doc_pair, child_info) self._protected_files[doc_pair.remote_ref] = True + if child_info.folderish: to_scan_new.append(child_info) except ThreadInterrupt: @@ -539,13 +560,11 @@ def _scan_recursive(self, info: FileInfo, /, *, recursive: bool = True) -> None: f"({remote_ref}/{child_pair.remote_ref})" ) if not remote_ref: - if not child_info.folderish: - # Alternative stream or xattr can have - # been removed by external software or user - digest = child_info.get_digest() - if child_pair.local_digest != digest: - child_pair.local_digest = digest - child_pair.local_state = "modified" + # Alternative stream or xattr can have been removed by external software or user + if not child_info.folderish and _file_changed( + child_info, child_pair + ): + child_pair.local_state = "modified" """ NXDRIVE-668: Here we might be in the case @@ -575,18 +594,16 @@ def _scan_recursive(self, info: FileInfo, /, *, recursive: bool = True) -> None: dao.insert_local_state(child_info, info.path) else: old_pair.local_state = "moved" - # Check digest also - digest = child_info.get_digest() - if old_pair.local_digest != digest: - old_pair.local_digest = digest + if not child_info.folderish: + # The digest or size will be updated in old_pair if it changed + _file_changed(child_info, old_pair) dao.update_local_state(old_pair, child_info) self._protected_files[old_pair.remote_ref] = True self._delete_files[child_pair.remote_ref] = child_pair - if not child_info.folderish: - digest = child_info.get_digest() - if child_pair.local_digest != digest: - child_pair.local_digest = digest - child_pair.local_state = "modified" + if not child_info.folderish and _file_changed( + child_info, child_pair + ): + child_pair.local_state = "modified" self._metrics["update_files"] += 1 dao.update_local_state(child_pair, child_info) if child_info.folderish: @@ -730,13 +747,12 @@ def _handle_move_on_known_pair( if pair and pair.remote_ref == remote_ref: local_info = client.try_get_info(rel_path) if local_info: - digest = local_info.get_digest() # Drop event if digest hasn't changed, can be the case # if only file permissions have been updated - if not doc_pair.folderish and pair.local_digest == digest: + if not doc_pair.folderish and not _file_changed(local_info, pair): log.debug( f"Dropping watchdog event [{evt.event_type}] as digest " - f"has not changed for {rel_path!r}" + f"(or size) has not changed for {rel_path!r}" ) # If pair are the same don't drop it. It can happen # in case of server rename on a document. @@ -744,7 +760,6 @@ def _handle_move_on_known_pair( dao.remove_state(doc_pair) return - pair.local_digest = digest pair.local_state = "modified" dao.update_local_state(pair, local_info) dao.remove_state(doc_pair) @@ -879,12 +894,12 @@ def _handle_watchdog_event_on_known_acquired_pair( "Created event on a known pair with no remote_ref, this should " f"only happen in case of a quick move and copy-paste: {doc_pair!r}" ) - if local_info.get_digest() == doc_pair.local_digest: + if not _file_changed(local_info, doc_pair): return log.info( "Created event on a known pair with no remote_ref " - f"but with different digest: {doc_pair!r}" + f"but with different digest (or size): {doc_pair!r}" ) else: # NXDRIVE-509 @@ -898,20 +913,14 @@ def _handle_watchdog_event_on_known_acquired_pair( dao.update_local_modification_time(doc_pair, local_info) return - # We can't allow this branch to be taken for big files - # because computing their digest will explode everything. - # This code is taken _a lot_ when copying big files, so it - # makes sens to bypass this check. - if ( - local_info.size < Options.big_file * 1024 * 1024 - and doc_pair.pair_state == "synchronized" - ): - digest = local_info.get_digest() - # Unchanged digest, can be the case if only the last - # modification time or file permissions have been updated - if doc_pair.local_digest == digest: + if doc_pair.pair_state == "synchronized": + if _file_changed(local_info, doc_pair): + log.debug("Digest (or size) has changed") + doc_pair.local_state = "modified" + else: + # Can be the case if only the last modification time or file permissions has been updated log.info( - f"Digest has not changed for {rel_path!r} (watchdog event " + f"Digest (or size) has not changed for {rel_path!r} (watchdog event " f"[{evt.event_type}]), only update last_local_updated" ) if not local_info.remote_ref and doc_pair.remote_ref: @@ -919,9 +928,6 @@ def _handle_watchdog_event_on_known_acquired_pair( dao.update_local_modification_time(doc_pair, local_info) return - doc_pair.local_digest = digest - doc_pair.local_state = "modified" - if evt.event_type == "modified": # Handle files that take some time to be fully copied ongoing_copy = False diff --git a/nxdrive/utils.py b/nxdrive/utils.py index c46cb852..c2a0b861 100644 --- a/nxdrive/utils.py +++ b/nxdrive/utils.py @@ -458,6 +458,11 @@ def is_generated_tmp_file(name: str, /) -> Tuple[bool, Optional[bool]]: return do_not_ignore, no_delay_effect +def is_large_file(size: int) -> bool: + """Return True is the given file *size* is considered large.""" + return size >= Options.big_file * 1024 * 1024 + + def normalized_path(path: Union[bytes, str, Path], /) -> Path: """Return absolute, normalized file path.