-
Notifications
You must be signed in to change notification settings - Fork 111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
can deletions trigger automatic removal of downstream files? #7593
Comments
In your case, I guess by It might be something of interest for https://github.com/datalad/datalad-remake by @mih which aims to provide a feature for re-generation of files from |
I actually mean Just dropping this here in case anyone is interested. It's a script that scans through the git history for the main branch and determines which files depend on others (input/output) and also determines which files are no longer valid, because their inputs have either been removed or replaced. (sorry it may be incomprehensible... it's for reference)
import json
# pip install gitpython
from git import Repo
def get_datalad_commits(repo_path, branch='main'):
# Load the repository from the given path
repo = Repo(repo_path)
# Check out the branch
assert not repo.bare
repo.git.checkout(branch)
# Get all commits in the branch
commits = list(repo.iter_commits(branch))
datalad_commits = []
for commit in commits:
message = commit.message.strip()
if isinstance(message, bytes):
try:
message = message.decode('utf-8')
except UnicodeDecodeError:
print(f'WARNING: Unable to decode commit message: {message}')
continue
assert isinstance(message, str)
if message.startswith('[DATALAD'):
lines = message.split('\n')
if len(lines) == 0:
continue
in_json = False
json_lines = []
for line in lines:
if line.startswith('=== '):
in_json = True
elif line.startswith('^^^ '):
in_json = False
elif in_json:
json_lines.append(line)
if len(json_lines) > 0:
json_text = '\n'.join(json_lines)
record = json.loads(json_text)
else:
record = {}
# Checking for removed files
removed_files = []
if len(commit.parents) > 0:
for diff in commit.parents[0].diff(commit, create_patch=True):
if diff.deleted_file:
removed_files.append(diff.a_path)
datalad_commits.append({
'record': record,
'message': lines[0],
'removed_files': removed_files
})
# reverse the order of commits
datalad_commits.reverse()
return datalad_commits
# Specify the path to your repository
repo_path = '.'
# Get all commit messages from the main branch
datalad_commits = get_datalad_commits(repo_path, 'main')
# for commit in datalad_commits:
# print(commit['message'])
# print(commit['record'])
# if len(commit['removed_files']) > 0:
# print('Removed files:')
# for removed_file in commit['removed_files']:
# print(f' {removed_file}')
# print('')
existing_outputs = {}
valid_files_with_dependencies = {}
for commit in datalad_commits:
record = commit['record']
removed_files = [r for r in commit['removed_files']]
# add existing outputs to the list of removed files
if 'outputs' in record:
for output in record['outputs']:
if output in valid_files_with_dependencies:
removed_files.extend(output)
# remove the removed files
for removed_file in removed_files:
if removed_file in valid_files_with_dependencies:
del valid_files_with_dependencies[removed_file]
if removed_file in existing_outputs:
del existing_outputs[removed_file]
# now invalidate all files with missing dependencies because no longer valid
something_changed = True
while something_changed:
to_remove = []
for file, v in valid_files_with_dependencies.items():
dependencies = v['dependencies']
for dep in dependencies:
if dep not in valid_files_with_dependencies:
to_remove.append(file)
break
something_changed = False
for file in to_remove:
del valid_files_with_dependencies[file]
something_changed = True
# add the new outputs
if 'outputs' in record:
for output in record['outputs']:
dependencies = record.get('inputs', [])
if 'extra_inputs' in record:
dependencies.extend(record['extra_inputs'])
valid_files_with_dependencies[output] = {
'dependencies': dependencies
}
existing_outputs[output] = True
print('Files with dependencies:')
for file, v in valid_files_with_dependencies.items():
dependencies = v['dependencies']
dependencies.sort()
print(f' {file}:')
for dep in dependencies:
print(f' {dep}')
# collect invalid outputs
invalid_outputs = []
for file in existing_outputs.keys():
if file not in valid_files_with_dependencies:
invalid_outputs.append(file)
invalid_outputs.sort()
print('Invalid outputs:')
for file in invalid_outputs:
print(f' {file}') Sample output for a project I am working on
|
Maybe this feature already exists? @yarikoptic
It would be really nice if a deletion of a file would automatically cascade a deletion of downstream files based on the --inputs and --outputs of runs. And it would be great if all these deletions could occur in a single atomic commit.
Then...
The text was updated successfully, but these errors were encountered: