From 7afd869854bc3d5df403e63f243cf7f8f37301c7 Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Wed, 21 May 2008 11:49:22 -0700 Subject: [PATCH] cli: initial implementation of 'cp -r', probably doesn't work yet --- src/allmydata/scripts/tahoe_cp.py | 677 +++++++++++++++++++++++++----- src/allmydata/test/test_system.py | 29 +- 2 files changed, 593 insertions(+), 113 deletions(-) diff --git a/src/allmydata/scripts/tahoe_cp.py b/src/allmydata/scripts/tahoe_cp.py index b904b179..661272af 100644 --- a/src/allmydata/scripts/tahoe_cp.py +++ b/src/allmydata/scripts/tahoe_cp.py @@ -4,142 +4,597 @@ import urllib import simplejson from allmydata.scripts.common import get_alias, escape_path, DefaultAliasMarker from allmydata.scripts.common_http import do_http +from allmydata import uri def ascii_or_none(s): if s is None: return s return str(s) -def get_info(nodeurl, aliases, target): - rootcap, path = get_alias(aliases, target, None) - if rootcap == DefaultAliasMarker: - # this is a local file - pathname = os.path.abspath(os.path.expanduser(path)) - if not os.path.exists(pathname): - return ("empty", "local", pathname) - if os.path.isdir(pathname): - return ("directory", "local", pathname) +class WriteError(Exception): + pass +class ReadError(Exception): + pass + +def GET_to_file(url): + resp = do_http("GET", url) + if resp.status == 200: + return resp + raise ReadError("Error during GET: %s %s %s" % (resp.status, + resp.reason, + resp.read())) +def GET_to_string(url): + f = GET_to_file(url) + return f.read() + +def PUT(url, data): + resp = do_http("PUT", url, data) + if resp.status in (200, 201): + return resp.read() + raise WriteError("Error during PUT: %s %s %s" % (resp.status, resp.reason, + resp.read())) + +def mkdir(targeturl): + resp = do_http("POST", targeturl) + if resp.status in (200, 201): + return resp.read().strip() + raise WriteError("Error during mkdir: %s %s %s" % (resp.status, resp.reason, + resp.read())) + +def make_tahoe_subdirectory(nodeurl, parent_writecap, name): + url = nodeurl + "/".join(["uri", + urllib.quote(parent_writecap), + urllib.quote(name), + ]) + "?t=mkdir" + resp = do_http("POST", url) + if resp.status in (200, 201): + return resp.read().strip() + raise WriteError("Error during mkdir: %s %s %s" % (resp.status, resp.reason, + resp.read())) + + +class LocalFileSource: + def __init__(self, pathname): + self.pathname = pathname + + def need_to_copy_bytes(self): + return True + + def open(self): + return open(self.pathname, "rb") + +class LocalFileTarget: + def __init__(self, pathname): + self.pathname = pathname + +class LocalDirectorySource: + def __init__(self, progressfunc, pathname): + self.progressfunc = progressfunc + self.pathname = pathname + self.children = None + + def populate(self, recurse): + children = os.listdir(self.pathname) + for i,n in enumerate(children): + self.progressfunc("examining %d of %d" % (i, len(children))) + pn = os.path.join(self.pathname, n) + if os.path.isdir(pn): + child = LocalDirectorySource(self.progressfunc, pn) + self.children[n] = child + if recurse: + child.populate(True) + else: + assert os.path.isfile(pn) + self.children[n] = LocalFileSource(pn) + +class LocalDirectoryTarget: + def __init__(self, progressfunc, pathname): + self.progressfunc = progressfunc + self.pathname = pathname + self.children = None + + def populate(self, recurse): + children = os.listdir(self.pathname) + for i,n in enumerate(children): + self.progressfunc("examining %d of %d" % (i, len(children))) + pn = os.path.join(self.pathname, n) + if os.path.isdir(pn): + child = LocalDirectoryTarget(self.progressfunc, pn) + self.children[n] = child + if recurse: + child.populate(True) + else: + assert os.path.isfile(pn) + self.children[n] = LocalFileTarget(pn) + + def get_child_target(self, name): + if self.children is None: + self.populate(False) + if name in self.children: + return self.children[name] + pathname = os.path.join(self.pathname, name) + os.makedirs(pathname) + return LocalDirectoryTarget(self.progressfunc, pathname) + + def put_file(self, name, inf): + pathname = os.path.join(self.pathname, name) + outf = open(pathname, "wb") + while True: + data = inf.read(32768) + if not data: + break + outf.write(data) + outf.close() + + def set_children(self): + pass + +class TahoeFileSource: + def __init__(self, nodeurl, mutable, writecap, readcap): + self.nodeurl = nodeurl + self.mutable = mutable + self.writecap = writecap + self.readcap = readcap + + def need_to_copy_bytes(self): + if self.mutable: + return True + return False + + def open(self): + url = self.nodeurl + "uri/" + urllib.quote(self.readcap) + return GET_to_file(url) + + def bestcap(self): + return self.writecap or self.readcap + +class TahoeFileTarget: + def __init__(self, nodeurl, mutable, writecap, readcap): + self.nodeurl = nodeurl + self.mutable = mutable + self.writecap = writecap + self.readcap = readcap + +class TahoeDirectorySource: + def __init__(self, nodeurl, cache, progressfunc): + self.nodeurl = nodeurl + self.cache = cache + self.progressfunc = progressfunc + + def init_from_grid(self, writecap, readcap): + self.writecap = writecap + self.readcap = readcap + bestcap = writecap or readcap + url = self.nodeurl + "uri/%s" % urllib.quote(bestcap) + resp = do_http("GET", url + "?t=json") + assert resp.status == 200 + parsed = simplejson.loads(resp.read()) + nodetype, d = parsed + assert nodetype == "dirnode" + self.mutable = d.get("mutable", False) # older nodes don't provide it + self.children_d = d["children"] + self.children = None + + def populate(self, recurse): + self.children = {} + for i,(name, data) in enumerate(self.children_d): + self.progressfunc("examining %d of %d" % (i, len(self.children_d))) + if data[0] == "filenode": + mutable = data[1].get("mutable", False) + writecap = ascii_or_none(data[1].get("rw_uri")) + readcap = ascii_or_none(data[1].get("ro_uri")) + self.children[name] = TahoeFileSource(self.nodeurl, mutable, + writecap, readcap) + else: + assert data[0] == "dirnode" + writecap = ascii_or_none(data[1].get("rw_uri")) + readcap = ascii_or_none(data[1].get("ro_uri")) + if writecap and writecap in self.cache: + child = self.cache[writecap] + elif readcap and readcap in self.cache: + child = self.cache[readcap] + else: + child = TahoeDirectorySource(self.nodeurl, self.cache, + self.progressfunc) + child.init_from_grid(writecap, readcap) + if writecap: + self.cache[writecap] = child + if readcap: + self.cache[readcap] = child + if recurse: + child.populate(True) + self.children[name] = child + +class TahoeDirectoryTarget: + def __init__(self, nodeurl, cache, progressfunc): + self.nodeurl = nodeurl + self.cache = cache + self.progressfunc = progressfunc + self.new_children = {} + + def init_from_grid(self, writecap, readcap): + self.writecap = writecap + self.readcap = readcap + bestcap = writecap or readcap + url = self.nodeurl + "uri/%s" % urllib.quote(bestcap) + resp = do_http("GET", url + "?t=json") + assert resp.status == 200 + parsed = simplejson.loads(resp.read()) + nodetype, d = parsed + assert nodetype == "dirnode" + self.mutable = d.get("mutable", False) # older nodes don't provide it + self.children_d = d["children"] + self.children = None + + def just_created(self, writecap): + self.writecap = writecap + self.readcap = uri.from_string().get_readonly().to_string() + self.mutable = True + self.children_d = {} + self.children = {} + + def populate(self, recurse): + self.children = {} + for i,(name, data) in enumerate(self.children_d): + self.progressfunc("examining %d of %d" % (i, len(self.children_d))) + if data[0] == "filenode": + mutable = data[1].get("mutable", False) + writecap = ascii_or_none(data[1].get("rw_uri")) + readcap = ascii_or_none(data[1].get("ro_uri")) + self.children[name] = TahoeFileTarget(self.nodeurl, mutable, + writecap, readcap) + else: + assert data[0] == "dirnode" + writecap = ascii_or_none(data[1].get("rw_uri")) + readcap = ascii_or_none(data[1].get("ro_uri")) + if writecap and writecap in self.cache: + child = self.cache[writecap] + elif readcap and readcap in self.cache: + child = self.cache[readcap] + else: + child = TahoeDirectoryTarget(self.nodeurl, self.cache, + self.progressfunc) + child.init_from_grid(writecap, readcap) + if writecap: + self.cache[writecap] = child + if readcap: + self.cache[readcap] = child + if recurse: + child.populate(True) + self.children[name] = child + + def get_child_target(self, name): + # return a new target for a named subdirectory of this dir + if self.children is None: + self.populate(False) + if name in self.children: + return self.children[name] + writecap = make_tahoe_subdirectory(self.nodeurl, self.writecap, name) + child = TahoeDirectoryTarget(self.nodeurl, self.cache, + self.progressfunc) + child.just_created(writecap) + self.children[name] = child + return child + + def put_file(self, name, inf): + url = self.nodeurl + "uri" + # I'm not sure this will work: we might not have .seek, so if not: + #inf = inf.read() + + # TODO: this always creates immutable files. We might want an option + # to always create mutable files, or to copy mutable files into new + # mutable files. + resp = do_http("PUT", url, inf) + filecap = check_PUT(resp) + self.new_children[name] = filecap + + def put_uri(self, name, filecap): + self.new_children[name] = filecap + + def set_children(self): + if not self.new_children: + return + # XXX TODO t=set_children + +class Copier: + def __init__(self, nodeurl, config, aliases, + verbosity, stdout, stderr, + progressfunc=None): + if nodeurl[-1] != "/": + nodeurl += "/" + self.nodeurl = nodeurl + self.progressfunc = progressfunc + self.config = config + self.aliases = aliases + self.verbosity = verbosity + self.stdout = stdout + self.stderr = stderr + + def to_stderr(self, text): + print >>self.stderr, text + + def do_copy(self, sources, destination): + recursive = self.config["recursive"] + + #print "sources:", sources + #print "dest:", destination + + target = self.get_info(destination) + #print target + + source_info = dict([(self.get_info(source), source) + for source in sources]) + source_files = [s for s in source_info if s[0] == "file"] + source_dirs = [s for s in source_info if s[0] == "directory"] + empty_sources = [s for s in source_info if s[0] == "empty"] + if empty_sources: + for s in empty_sources: + self.to_stderr("no such file or directory %s" % source_info[s]) + return 1 + + #print "source_files", " ".join([source_info[s] for s in source_files]) + #print "source_dirs", " ".join([source_info[s] for s in source_dirs]) + + if source_dirs and not recursive: + self.to_stderr("cannot copy directories without --recursive") + return 1 + + if target[0] == "file": + # cp STUFF foo.txt, where foo.txt already exists. This limits the + # possibilities considerably. + if len(sources) > 1: + self.to_stderr("target '%s' is not a directory" % destination) + return 1 + if source_dirs: + self.to_stderr("cannot copy directory into a file") + return 1 + return self.copy_to_file(source_files[0], target) + + if target[0] == "empty": + if recursive: + return self.copy_to_directory(source_files, source_dirs, target) + if len(sources) > 1: + # if we have -r, we'll auto-create the target directory. Without + # it, we'll only create a file. + self.to_stderr("cannot copy multiple files into a file without -r") + return 1 + # cp file1 newfile + return self.copy_to_file(source_files[0], target) + + if target[0] == "directory": + return self.copy_to_directory(source_files, source_dirs, target) + + self.to_stderr("unknown target") + return 1 + + def get_info(self, target): + rootcap, path = get_alias(self.aliases, target, None) + if rootcap == DefaultAliasMarker: + # this is a local file + pathname = os.path.abspath(os.path.expanduser(path)) + if not os.path.exists(pathname): + name = os.path.basename(pathname) + return ("empty", "local", name, pathname) + if os.path.isdir(pathname): + return ("directory", "local", pathname) + else: + assert os.path.isfile(pathname) + name = os.path.basename(pathname) + return ("file", "local", name, pathname) else: - assert os.path.isfile(pathname) - return ("file", "local", pathname) - else: - # this is a tahoe object - url = nodeurl + "uri/%s" % urllib.quote(rootcap) - if path: - url += "/" + escape_path(path) + # this is a tahoe object + url = self.nodeurl + "uri/%s" % urllib.quote(rootcap) + name = None + if path: + url += "/" + escape_path(path) + last_slash = path.rfind("/") + name = path + if last_slash: + name = path[last_slash+1:] + return self.get_info_tahoe_dirnode(url, name) + + def get_info_tahoe_dirnode(self, url, name): resp = do_http("GET", url + "?t=json") if resp.status == 404: # doesn't exist yet - return ("empty", "tahoe", False, None, None, url) + return ("empty", "tahoe", False, name, None, None, url) parsed = simplejson.loads(resp.read()) nodetype, d = parsed mutable = d.get("mutable", False) # older nodes don't provide 'mutable' rw_uri = ascii_or_none(d.get("rw_uri")) ro_uri = ascii_or_none(d.get("ro_uri")) if nodetype == "dirnode": - return ("directory", "tahoe", mutable, rw_uri, ro_uri, url) + return ("directory", "tahoe", mutable, name, rw_uri, ro_uri, + d["children"], url) else: - return ("file", "tahoe", mutable, rw_uri, ro_uri, url) + return ("file", "tahoe", mutable, name, rw_uri, ro_uri, url) -def copy(nodeurl, config, aliases, sources, destination, - verbosity, stdout, stderr): - if nodeurl[-1] != "/": - nodeurl += "/" - recursive = config["recursive"] - - #print "sources:", sources - #print "dest:", destination - - target = get_info(nodeurl, aliases, destination) - #print target - - source_info = dict([(get_info(nodeurl, aliases, source), source) - for source in sources]) - source_files = [s for s in source_info if s[0] == "file"] - source_dirs = [s for s in source_info if s[0] == "directory"] - empty_sources = [s for s in source_info if s[0] == "empty"] - if empty_sources: - for s in empty_sources: - print >>stderr, "no such file or directory %s" % source_info[s] - return 1 - #print "source_files", " ".join([source_info[s] for s in source_files]) - #print "source_dirs", " ".join([source_info[s] for s in source_dirs]) + def get_file_data(self, source): + assert source[0] == "file" + if source[1] == "local": + (ig1, ig2, name, pathname) = source + return open(pathname, "rb").read() + (ig1, ig2, mutable, name, writecap, readcap, url) = source + return GET_to_string(url) - if source_dirs and not recursive: - print >>stderr, "cannot copy directories without --recursive" - return 1 + def put_file_data(self, data, target): + assert target[0] in ("file", "empty") + if target[1] == "local": + (ig1, ig2, name, pathname) = target + open(pathname, "wb").write(data) + return True + (ig1, ig2, mutable, name, writecap, readcap, url) = target + return PUT(url, data) - if target[0] == "file": - # cp STUFF foo.txt, where foo.txt already exists. This limits the - # possibilities considerably. - if len(sources) > 1: - print >>stderr, "target '%s' is not a directory" % destination - return 1 - if source_dirs: - print >>stderr, "cannot copy directory into a file" - return 1 - return copy_to_file(source_files[0], target) - - if target[0] == "empty": - if recursive: - return copy_to_directory(source_files, source_dirs, target) - if len(sources) > 1: - # if we have -r, we'll auto-create the target directory. Without - # it, we'll only create a file. - print >>stderr, "cannot copy multiple files into a file without -r" - return 1 - # cp file1 newfile - return copy_to_file(source_files[0], target) + def put_uri(self, uri, targeturl): + return PUT(targeturl + "?t=uri", uri) - if target[0] == "directory": - return copy_to_directory(source_files, source_dirs, target) + def upload_data(self, data): + url = self.nodeurl + "uri" + return PUT(url, data) - print >>stderr, "unknown target" - return 1 + def copy_to_file(self, source, target): + assert source[0] == "file" + # do we need to copy bytes? + if source[1] == "local" or source[2] == True or target[1] == "local": + # yes + data = self.get_file_data(source) + self.put_file_data(data, target) + return + # no, we're getting data from an immutable source, and we're copying + # into the tahoe grid, so we can just copy the URI. + uri = source[3] or source[4] # prefer rw_uri, fall back to ro_uri + # TODO: if the original was mutable, and we're creating the target, + # should be we create a mutable file to match? At the moment we always + # create immutable files. + self.put_uri(uri, target[-1]) + def copy_to_directory(self, source_file_infos, source_dir_infos, + target_info): + # step one: build a graph of the source tree. This returns a dictionary, + # with child names as keys, and values that are either Directory or File + # instances (local or tahoe). + source_dirs = self.build_graphs(source_dir_infos) -def get_file_data(source): - assert source[0] == "file" - if source[1] == "local": - return open(source[2], "rb").read() - return do_http("GET", source[-1]).read() + # step two: create the top-level target directory object + assert target_info[0] in ("empty", "directory") + if target_info[1] == "local": + pathname = target_info[-1] + if not os.path.exists(pathname): + os.makedirs(pathname) + assert os.path.isdir(pathname) + target = LocalDirectoryTarget(self.progressfunc, target_info[-1]) + else: + assert target_info[1] == "tahoe" + target = TahoeDirectoryTarget(self.nodeurl, self.cache, + self.progressfunc) + if target_info[0] == "empty": + writecap = mkdir(target_info[-1]) + target.just_created(writecap) + else: + (ig1, ig2, mutable, name, writecap, readcap, url) = target_info + target.init_from_grid(writecap, readcap) -class WriteError(Exception): - pass + # step three: find a target for each source node, creating + # directories as necessary. 'targetmap' is a dictionary that uses + # target Directory instances as keys, and has values of + # (name->sourceobject) dicts for all the files that need to wind up + # there. -def check_PUT(resp): - if resp.status in (200, 201): - return True - raise WriteError("Error during PUT: %s %s %s" % (resp.status, resp.reason, - resp.read())) + # sources are all LocalFile/LocalDirectory/TahoeFile/TahoeDirectory + # target is LocalDirectory/TahoeDirectory -def put_file_data(data, target): - if target[1] == "local": - open(target[2], "wb").write(data) - return True - resp = do_http("PUT", target[-1], data) - return check_PUT(resp) - -def put_uri(uri, target): - resp = do_http("PUT", target[-1] + "?t=uri", uri) - return check_PUT(resp) - -def copy_to_file(source, target): - assert source[0] == "file" - # do we need to copy bytes? - if source[1] == "local" or source[2] == True or target[1] == "local": - # yes - data = get_file_data(source) - put_file_data(data, target) - return - # no, we're getting data from an immutable source, and we're copying into - # the tahoe grid, so we can just copy the URI. - uri = source[3] or source[4] # prefer rw_uri, fall back to ro_uri - # TODO: if the original was mutable, and we're creating the target, - # should be we create a mutable file to match? At the moment we always - # create immutable files. - put_uri(uri, target) - -def copy_to_directory(source_files, source_dirs, target): - NotImplementedError + self.targetmap = {} + self.files_to_copy = 0 + + for source in source_file_infos: + if source[1] == "local": + (ig1, ig2, name, pathname) = source + s = LocalFileSource(pathname) + else: + assert source[1] == "tahoe" + (ig1, ig2, mutable, name, writecap, readcap, url) = source + s = TahoeFileSource(self.nodeurl, mutable, + writecap, readcap) + self.attach_to_target(s, name, target) + self.files_to_copy += 1 + + for source in source_dirs: + self.assign_targets(source, target) + + self.progress("starting copy, %d files, %d directories" % + (self.files_to_copy, len(self.targets))) + self.files_copied = 0 + self.targets_finished = 0 + + # step four: walk through the list of targets. For each one, copy all + # the files. If the target is a TahoeDirectory, upload and create + # read-caps, then do a set_children to the target directory. + + for target in self.targets: + self.copy_files(self.targets[target], target) + self.targets_finished += 1 + self.progress("%d/%d directories" % + (self.targets_finished, len(self.targets))) + + def attach_to_target(self, source, name, target): + if target not in self.targets: + self.targets[target] = {} + self.targets[target][name] = source + self.files_to_copy += 1 + + def assign_targets(self, source, target): + # copy everything in s to the target + assert isinstance(source, (LocalDirectorySource, TahoeDirectorySource)) + + for name, child in source.children.items(): + if isinstance(child, (LocalDirectorySource, TahoeDirectorySource)): + # we will need a target directory for this one + subtarget = target.get_child_target(name) + self.assign_targets(source, subtarget) + else: + assert isinstance(child, (LocalFileSource, TahoeFileSource)) + self.attach_to_target(source, name, target) + + + + def copy_files(self, targetmap, target): + for name, source in targetmap.items(): + assert isinstance(source, (LocalFileSource, TahoeFileSource)) + self.copy_file(source, name, target) + self.files_copied += 1 + self.progress("%d/%d files, %d/%d directories" % + (self.files_copied, self.files_to_copy, + self.targets_finished, len(self.targets))) + target.set_children() + + def need_to_copy_bytes(self, source, target): + if source.need_to_copy_bytes: + # mutable tahoe files, and local files + return True + if isinstance(target, LocalDirectoryTarget): + return True + return False + + def copy_file(self, source, name, target): + assert isinstance(source, (LocalFileSource, TahoeFileSource)) + if self.need_to_copy_bytes(source, target): + # if the target is a local directory, this will just write the + # bytes to disk. If it is a tahoe directory, it will upload the + # data, and stash the new filecap for a later set_children call. + f = source.open() + target.put_file(name, f) + return + # otherwise we're copying tahoe to tahoe, and using immutable files, + # so we can just make a link + target.put_uri(name, source.bestcap()) + + + def progress(self, message): + print message + if self.progressfunc: + self.progressfunc(message) + + def build_graphs(self, sources): + cache = {} + graphs = [] + for source in sources: + assert source[0] == "directory" + if source[1] == "local": + root = LocalDirectorySource(self.progress, source[-1]) + root.populate(True) + else: + assert source[1] == "tahoe" + (ig1, ig2, mutable, name, writecap, readcap, url) = source + root = TahoeDirectorySource(self.nodeurl, cache, self.progress) + root.init_from_grid(writecap, readcap) + root.populate(True) + graphs.append(root) + return graphs + + +def copy(nodeurl, config, aliases, sources, destination, + verbosity, stdout, stderr): + c = Copier(nodeurl, config, aliases, verbosity, stdout, stderr) + return c.do_copy(sources, destination) diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py index fbe34113..390e84d2 100644 --- a/src/allmydata/test/test_system.py +++ b/src/allmydata/test/test_system.py @@ -1603,9 +1603,12 @@ class SystemTest(testutil.SignalMixin, testutil.PollMixin, testutil.StallMixin, datas.append(data) open(fn,"wb").write(data) - def _check_stdout_against((out,err), filenum): + def _check_stdout_against((out,err), filenum=None, data=None): self.failUnlessEqual(err, "") - self.failUnlessEqual(out, datas[filenum]) + if filenum is not None: + self.failUnlessEqual(out, datas[filenum]) + if data is not None: + self.failUnlessEqual(out, data) # test all both forms of put: from a file, and from stdin # tahoe put bar FOO @@ -1778,6 +1781,28 @@ class SystemTest(testutil.SignalMixin, testutil.PollMixin, testutil.StallMixin, d.addCallback(run, "get", "tahoe:file3") d.addCallback(_check_stdout_against, 5) + # recursive copy: setup + dn = os.path.join(self.basedir, "dir1") + os.makedirs(dn) + open(os.path.join(dn, "file1"), "wb").write("file1") + open(os.path.join(dn, "file2"), "wb").write("file2") + open(os.path.join(dn, "file3"), "wb").write("file3") + sdn2 = os.path.join(dn, "subdir2") + os.makedirs(sdn2) + open(os.path.join(dn, "file4"), "wb").write("file4") + open(os.path.join(dn, "file5"), "wb").write("file5") + + # from disk into tahoe + #d.addCallback(run, "cp", "-r", dn, "tahoe:dir1") + #d.addCallback(run, "ls") + #d.addCallback(_check_ls, ["dir1"]) + #d.addCallback(run, "ls", "dir1") + #d.addCallback(_check_ls, ["file1", "file2", "file3", "subdir2"]) + #d.addCallback(run, "ls", "tahoe:dir1/subdir2") + #d.addCallback(_check_ls, ["file4", "file5"]) + #d.addCallback(run, "get", "dir1/subdir2/file4") + #d.addCallback(_check_stdout_against, data="file4") + # tahoe_ls doesn't currently handle the error correctly: it tries to # JSON-parse a traceback. ## def _ls_missing(res): -- 2.45.2