|
- #!/usr/bin/env python3
-
- # Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
- #
- # NVIDIA CORPORATION and its licensors retain all intellectual property
- # and proprietary rights in and to this software, related documentation
- # and any modifications thereto. Any use, reproduction, disclosure or
- # distribution of this software and related documentation without an express
- # license agreement from NVIDIA CORPORATION is strictly prohibited.
-
- import argparse
- import os
- from pathlib import Path, PurePosixPath
-
- import numpy as np
- import json
- import sys
- import math
- import cv2
- import os
- import shutil
-
def parse_args():
    """Parse the command-line options of this conversion script.

    Numeric options are converted by argparse itself, so a malformed value is
    rejected immediately instead of surfacing only after ffmpeg/colmap have
    already been run.

    Returns:
        argparse.Namespace with one attribute per option declared below.
    """
    parser = argparse.ArgumentParser(description="convert a text colmap export to nerf format transforms.json; optionally convert video to images, and optionally run colmap in the first place")

    parser.add_argument("--video_in", default="", help="run ffmpeg first to convert a provided video file into a set of images. uses the video_fps parameter also")
    parser.add_argument("--video_fps", default=2, type=float, help="frames per second to extract from the video given by video_in")
    parser.add_argument("--time_slice", default="", help="time (in seconds) in the format t1,t2 within which the images should be generated from the video. eg: \"--time_slice '10,300'\" will generate images only from 10th second to 300th second of the video")
    parser.add_argument("--run_colmap", action="store_true", help="run colmap first on the image folder")
    parser.add_argument("--colmap_matcher", default="sequential", choices=["exhaustive","sequential","spatial","transitive","vocab_tree"], help="select which matcher colmap should use. sequential for videos, exhaustive for adhoc images")
    parser.add_argument("--colmap_db", default="colmap.db", help="colmap database filename")
    parser.add_argument("--images", default="images", help="input path to the images")
    parser.add_argument("--text", default="colmap_text", help="input path to the colmap text files (set automatically if run_colmap is used)")
    # Integer choices keep the default (16) and the accepted values in the same type.
    parser.add_argument("--aabb_scale", default=16, type=int, choices=[1, 2, 4, 8, 16], help="large scene scale factor. 1=scene fits in unit cube; power of 2 up to 16")
    parser.add_argument("--skip_early", default=0, type=int, help="skip this many images from the start")
    parser.add_argument("--out", default="transforms.json", help="output path")
    return parser.parse_args()
-
def do_system(arg):
    """Run a shell command and terminate the script if it fails.

    Args:
        arg: the command line, passed verbatim to ``os.system``.

    Raises:
        SystemExit: when the command reports failure. The exit code is the
            command's own exit code where it can be determined, otherwise 1.
    """
    print(f"==== running: {arg}")
    err = os.system(arg)
    if err:
        print("FATAL: command failed")
        exit_code = err
        if os.name == "posix":
            # On POSIX os.system returns a wait status (exit code << 8).
            # Passing that straight to sys.exit gets truncated to 8 bits, so
            # a command failing with code 1 made this script exit with 0.
            exit_code = os.WEXITSTATUS(err) if os.WIFEXITED(err) else 1
        sys.exit(exit_code or 1)
-
def run_ffmpeg(args):
    """Extract frames from ``args.video_in`` into the image folder via ffmpeg.

    A relative ``args.images`` is resolved against the directory containing
    the video, and ``args.images`` is updated in place with that path. After
    an interactive confirmation the image folder is deleted and recreated,
    then ffmpeg writes numbered JPEG frames into it.

    Args:
        args: parsed options; reads ``images``, ``video_in``, ``video_fps``
            and ``time_slice`` and overwrites ``images``.

    Raises:
        SystemExit: if the user declines the confirmation prompt.
        OSError: if the existing image folder cannot be removed or the new
            one cannot be created.
    """
    if not os.path.isabs(args.images):
        args.images = os.path.join(os.path.dirname(args.video_in), args.images)
    images = args.images
    video = args.video_in
    fps = float(args.video_fps) or 1.0  # an fps of 0 falls back to 1.0
    print(f"running ffmpeg with input video file={video}, output image folder={images}, fps={fps}.")
    # An empty answer counts as "yes": "y" is appended before the first character is tested.
    if (input(f"warning! folder '{images}' will be deleted/replaced. continue? (Y/n)").lower().strip()+"y")[:1] != "y":
        sys.exit(1)
    try:
        shutil.rmtree(images)
    except FileNotFoundError:
        # Nothing to delete; any other failure is a real problem and propagates.
        pass
    # Created from Python rather than a shell `mkdir`, so it does not depend on
    # the platform shell or on how the path would be word-split.
    os.makedirs(images)

    time_slice_value = ""
    time_slice = args.time_slice
    if time_slice:
        start, end = time_slice.split(",")
        # The backslash before each comma must reach ffmpeg literally, so it is
        # written as an escaped backslash rather than the invalid escape "\,".
        time_slice_value = f",select='between(t\\,{start}\\,{end})'"
    do_system(f"ffmpeg -i {video} -qscale:v 1 -qmin 1 -vf \"fps={fps}{time_slice_value}\" {images}/%04d.jpg")
-
def run_colmap(args):
    """Run the colmap command-line pipeline and export the model as text.

    After an interactive confirmation this removes an existing database,
    then runs feature extraction, matching (``args.colmap_matcher``), the
    mapper, a bundle-adjustment pass and finally ``model_converter`` to
    write a TXT export. The sparse and text output folders are deleted and
    recreated along the way.

    Args:
        args: parsed options; reads ``colmap_db``, ``images``, ``text`` and
            ``colmap_matcher`` and may overwrite ``text``.

    Raises:
        SystemExit: if the user declines the prompt or a colmap step fails.
        OSError: if an output folder cannot be removed or created.
    """
    def recreate_dir(path):
        # Start from an empty folder. A missing folder is fine; any other
        # removal error is surfaced instead of being silently swallowed.
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass
        os.makedirs(path)

    db = args.colmap_db
    images = args.images
    db_noext = str(Path(db).with_suffix(""))

    # NOTE(review): this compares against "text", but the parser default
    # visible in this file is "colmap_text", so the automatic naming only
    # triggers when the user passes `--text text` explicitly. Confirm the
    # intended sentinel value before changing it.
    if args.text == "text":
        args.text = db_noext + "_text"
    text = args.text
    sparse = db_noext + "_sparse"
    print(f"running colmap with:\n\tdb={db}\n\timages={images}\n\tsparse={sparse}\n\ttext={text}")
    # An empty answer counts as "yes": "y" is appended before the first character is tested.
    if (input(f"warning! folders '{sparse}' and '{text}' will be deleted/replaced. continue? (Y/n)").lower().strip()+"y")[:1] != "y":
        sys.exit(1)
    if os.path.exists(db):
        os.remove(db)
    do_system(f"colmap feature_extractor --ImageReader.camera_model OPENCV --SiftExtraction.estimate_affine_shape=true --SiftExtraction.domain_size_pooling=true --ImageReader.single_camera 1 --database_path {db} --image_path {images}")
    do_system(f"colmap {args.colmap_matcher}_matcher --SiftMatching.guided_matching=true --database_path {db}")
    recreate_dir(sparse)
    do_system(f"colmap mapper --database_path {db} --image_path {images} --output_path {sparse}")
    do_system(f"colmap bundle_adjuster --input_path {sparse}/0 --output_path {sparse}/0 --BundleAdjustment.refine_principal_point 1")
    recreate_dir(text)
    do_system(f"colmap model_converter --input_path {sparse}/0 --output_path {text} --output_type TXT")
-
def variance_of_laplacian(image):
    """Return the variance of the Laplacian of ``image``.

    The Laplacian is evaluated with ``cv2.CV_64F`` as the output depth and
    the variance is taken over the whole result.
    """
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian.var()
-
def sharpness(imagePath):
    """Return a sharpness score for the image stored at ``imagePath``.

    The image is loaded with OpenCV, converted from BGR to grayscale and
    scored with ``variance_of_laplacian``.

    Args:
        imagePath: path of the image file to score.

    Returns:
        The variance of the Laplacian of the grayscale image.

    Raises:
        OSError: if OpenCV cannot load the file.
    """
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # without this check the failure shows up as an opaque cvtColor error.
        raise OSError(f"could not read image '{imagePath}' (missing, unreadable or not a supported image format)")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return variance_of_laplacian(gray)
-
def qvec2rotmat(qvec):
    """Build a 3x3 rotation matrix from a quaternion.

    Args:
        qvec: indexable with at least four components; index 0 is used as
            the scalar part and indices 1..3 as the vector part.

    Returns:
        numpy array assembled from the three rows computed below.
    """
    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
    first_row = [
        1 - 2 * y**2 - 2 * z**2,
        2 * x * y - 2 * w * z,
        2 * z * x + 2 * w * y,
    ]
    second_row = [
        2 * x * y + 2 * w * z,
        1 - 2 * x**2 - 2 * z**2,
        2 * y * z - 2 * w * x,
    ]
    third_row = [
        2 * z * x - 2 * w * y,
        2 * y * z + 2 * w * x,
        1 - 2 * x**2 - 2 * y**2,
    ]
    return np.array([first_row, second_row, third_row])
-
def rotmat(a, b):
    """Return a 3x3 rotation matrix that rotates direction ``a`` onto ``b``.

    Both inputs are normalised first, so only their directions matter.

    Args:
        a: 3-vector (array-like), the direction to rotate from.
        b: 3-vector (array-like), the direction to rotate to.

    Returns:
        3x3 numpy array R such that R @ a/|a| is (approximately) b/|b|.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)
    v = np.cross(a, b)
    c = np.dot(a, b)
    s = np.linalg.norm(v)
    if c < 0 and s < 1e-6:
        # a and b are (numerically) opposite: the cross product vanishes and
        # the formula below would collapse to the identity. Rotate by pi
        # about an arbitrary axis perpendicular to a instead.
        helper = np.eye(3)[np.argmin(np.abs(a))]  # basis vector least aligned with a
        axis = np.cross(a, helper)
        axis /= np.linalg.norm(axis)
        return 2.0 * np.outer(axis, axis) - np.eye(3)
    # Skew-symmetric cross-product matrix of v.
    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
    # The 1e-10 term keeps the division finite when a and b are parallel (s == 0).
    return np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s ** 2 + 1e-10))
-
def closest_point_2_lines(oa, da, ob, db):
    """Return the point closest to two rays and a parallelism weight.

    Each ray has the form ``o + t*d``. The directions are normalised, and a
    positive ray parameter is clamped to zero before the two per-ray points
    are averaged.

    Args:
        oa, ob: ray origins (numpy 3-vectors).
        da, db: ray directions (numpy 3-vectors, any non-zero length).

    Returns:
        (point, weight): the midpoint of the two per-ray points, and the
        squared norm of the cross product of the unit directions, which goes
        to 0 if the lines are parallel.
    """
    unit_a = da / np.linalg.norm(da)
    unit_b = db / np.linalg.norm(db)
    normal = np.cross(unit_a, unit_b)
    weight = np.linalg.norm(normal)**2
    offset = ob - oa
    divisor = weight + 1e-10  # stays non-zero for parallel rays
    # Clamp each ray parameter so that it never becomes positive.
    ta = min(np.linalg.det([offset, unit_b, normal]) / divisor, 0)
    tb = min(np.linalg.det([offset, unit_a, normal]) / divisor, 0)
    midpoint = (oa+ta*unit_a+ob+tb*unit_b) * 0.5
    return midpoint, weight
-
if __name__ == "__main__":
    # Script entry point: optionally extract video frames and run colmap, then
    # read the colmap text export (cameras.txt and images.txt) and write the
    # camera intrinsics plus one re-oriented, re-centred and re-scaled
    # camera-to-world matrix per image to the output JSON file.
    args = parse_args()
    if args.video_in != "":
        run_ffmpeg(args)
    if args.run_colmap:
        run_colmap(args)
    AABB_SCALE = int(args.aabb_scale)
    SKIP_EARLY = int(args.skip_early)
    IMAGE_FOLDER = args.images
    TEXT_FOLDER = args.text
    OUT_PATH = args.out
    print(f"outputting to {OUT_PATH}...")
    camera_found = False
    with open(os.path.join(TEXT_FOLDER,"cameras.txt"), "r") as f:
        angle_x = math.pi / 2
        for line in f:
            # Example camera lines:
            # 1 SIMPLE_RADIAL 2048 1536 1580.46 1024 768 0.0045691
            # 1 OPENCV 3840 2160 3178.27 3182.09 1920 1080 0.159668 -0.231286 -0.00123982 0.00272224
            # 1 RADIAL 1920 1080 1665.1 960 540 0.0672856 -0.0761443
            if not line.strip() or line[0] == "#":
                # Blank lines carry no camera and would otherwise fail on els[2].
                continue
            els = line.split(" ")
            w = float(els[2])
            h = float(els[3])
            fl_x = float(els[4])
            fl_y = float(els[4])
            k1 = 0
            k2 = 0
            p1 = 0
            p2 = 0
            cx = w / 2
            cy = h / 2
            if els[1] == "SIMPLE_PINHOLE":
                cx = float(els[5])
                cy = float(els[6])
            elif els[1] == "PINHOLE":
                fl_y = float(els[5])
                cx = float(els[6])
                cy = float(els[7])
            elif els[1] == "SIMPLE_RADIAL":
                cx = float(els[5])
                cy = float(els[6])
                k1 = float(els[7])
            elif els[1] == "RADIAL":
                cx = float(els[5])
                cy = float(els[6])
                k1 = float(els[7])
                k2 = float(els[8])
            elif els[1] == "OPENCV":
                fl_y = float(els[5])
                cx = float(els[6])
                cy = float(els[7])
                k1 = float(els[8])
                k2 = float(els[9])
                p1 = float(els[10])
                p2 = float(els[11])
            else:
                # Unknown models keep the defaults set above.
                print("unknown camera model ", els[1])
            # fl = 0.5 * w / tan(0.5 * angle_x);
            angle_x = math.atan(w / (fl_x * 2)) * 2
            angle_y = math.atan(h / (fl_y * 2)) * 2
            fovx = angle_x * 180 / math.pi
            fovy = angle_y * 180 / math.pi
            camera_found = True

    if not camera_found:
        # Without a camera line none of the intrinsics below would be defined.
        print("FATAL: no camera found in cameras.txt")
        sys.exit(1)

    print(f"camera:\n\tres={w,h}\n\tcenter={cx,cy}\n\tfocal={fl_x,fl_y}\n\tfov={fovx,fovy}\n\tk={k1,k2} p={p1,p2} ")

    with open(os.path.join(TEXT_FOLDER,"images.txt"), "r") as f:
        i = 0
        bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4])
        out = {
            "camera_angle_x": angle_x,
            "camera_angle_y": angle_y,
            "fl_x": fl_x,
            "fl_y": fl_y,
            "k1": k1,
            "k2": k2,
            "p1": p1,
            "p2": p2,
            "cx": cx,
            "cy": cy,
            "w": w,
            "h": h,
            "aabb_scale": AABB_SCALE,
            "frames": [],
        }

        up = np.zeros(3)
        for line in f:
            line = line.strip()
            # startswith() instead of line[0]: the second line of an image
            # record may be empty, and it must still be counted so that the
            # odd/even pairing below stays aligned.
            if line.startswith("#"):
                continue
            i = i + 1
            if i < SKIP_EARLY*2:
                continue
            if i % 2 == 1:
                elems = line.split(" ") # 1-4 is quat, 5-7 is trans, 9ff is filename (9, if filename contains no spaces)
                # The image path is written relative to the current working directory.
                image_rel = os.path.relpath(IMAGE_FOLDER)
                name = str(f"./{image_rel}/{'_'.join(elems[9:])}")
                b = sharpness(name)
                print(name, "sharpness=",b)
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                R = qvec2rotmat(-qvec)
                t = tvec.reshape([3,1])
                # 4x4 matrix [R | t; 0 0 0 1]; its inverse is used as camera-to-world.
                m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)
                c2w = np.linalg.inv(m)
                c2w[0:3,2] *= -1 # flip the y and z axis
                c2w[0:3,1] *= -1
                c2w = c2w[[1,0,2,3],:] # exchanges rows 0 and 1 (original note: "swap y and z")
                c2w[2,:] *= -1 # flip whole world upside down

                up += c2w[0:3,1]

                frame = {"file_path":name,"sharpness":b,"transform_matrix": c2w}
                out["frames"].append(frame)
    nframes = len(out["frames"])
    if nframes == 0:
        # Every later step divides by per-frame quantities.
        print("FATAL: no frames found in images.txt")
        sys.exit(1)
    up = up / np.linalg.norm(up)
    print("up vector was", up)
    R = rotmat(up,[0,0,1]) # rotate up vector to [0,0,1]
    R = np.pad(R,[0,1]) # extend to 4x4 with a zero row and column
    R[-1, -1] = 1

    for frame in out["frames"]:
        frame["transform_matrix"] = np.matmul(R, frame["transform_matrix"]) # rotate up to be the z axis

    # find a central point they are all looking at
    print("computing center of attention...")
    totw = 0.0
    totp = np.array([0.0, 0.0, 0.0])
    for frame in out["frames"]:
        mf = frame["transform_matrix"][0:3,:]
        for other in out["frames"]:
            mg = other["transform_matrix"][0:3,:]
            p, weight = closest_point_2_lines(mf[:,3], mf[:,2], mg[:,3], mg[:,2])
            if weight > 0.01:
                totp += p*weight
                totw += weight
    if totw > 0.0:
        totp /= totw
    else:
        # No pair of view rays was far enough from parallel; dividing by zero
        # would turn every camera position into NaN, so skip re-centring.
        print("warning: could not determine a center of attention; keeping the origin")
    print(totp) # the cameras are looking at totp
    for frame in out["frames"]:
        frame["transform_matrix"][0:3,3] -= totp

    avglen = 0.
    for frame in out["frames"]:
        avglen += np.linalg.norm(frame["transform_matrix"][0:3,3])
    avglen /= nframes
    print("avg camera distance from origin", avglen)
    for frame in out["frames"]:
        frame["transform_matrix"][0:3,3] *= 4.0 / avglen # scale to "nerf sized"

    # numpy arrays are not JSON serialisable; convert to nested lists.
    for frame in out["frames"]:
        frame["transform_matrix"] = frame["transform_matrix"].tolist()
    print(nframes,"frames")
    print(f"writing {OUT_PATH}")
    with open(OUT_PATH, "w") as outfile:
        json.dump(out, outfile, indent=2)
|