The aim of the script is to take text from a text file and put it onto a stock video with an ai reading the text. Similar to those reddit stories on social media with parkour minecraft in the background.
import cv2
import time
from ffpyplayer.player import MediaPlayer
from Transcription import newTranscribeAudio
from pydub import AudioSegment
#get a gpt text generation to create a story based on a prompt, for example sci-fi story and spread it over 3-4 parts
#get stock footage, like minecraft parkour etc
#write text of script on the footage
#create video for each part
#have ai voiceover to read the transcript
cap = cv2.VideoCapture("Stock_Videos\Minecraft_Parkour.mp4")
transcription = newTranscribeAudio("final_us.wav")
player = MediaPlayer("final_us.mp3")
audio = AudioSegment.from_file("final_us.mp3")
story = open("Story.txt", "r").read()
story_split = story.split("||")
fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
video_duration = frame_count / fps # Duration of one loop of the video
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
audio_duration = len(audio) / 1000 # Duration in seconds
video_writer = cv2.VideoWriter(f"CompletedVideo.mp4", fourcc, fps, (1080, 1920))
choice = 0#part of the story choice
part_split = story_split[choice].split("")
with open("Segment.txt", "w") as file:
file.write(story_split[choice])
start_time = time.time()
length = len(part_split) - 1
next_text = []
for j in range(0, length):
temp = part_split[j].replace("\n", "")
next_text.append([temp])
index = 0
word_index = 0
frame_size_x = 1080
frame_size_y = 1920
audio_duration = len(audio) / 1000 # Duration in seconds
start_time = time.time()
wait_time = 1 / fps
while (time.time() - start_time) < audio_duration:
cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Restart video
elapsed_time = time.time() - start_time
print(video_writer)
if index >= len(transcription):
break
while cap.isOpened():
# Capture frames in the video
ret, frame = cap.read()
if not ret:
break
audio_frame, val = player.get_frame()
if val == 'eof': # End of file
print("Audio playback finished.")
break
if index >= len(transcription):
break
if frame_size_x == -1:
frame_size_x = frame.shape[1]
frame_size_y = frame.shape[0]
elapsed_time = time.time() - start_time
# describe the type of font
# to be used.
font = cv2.FONT_HERSHEY_SIMPLEX
trans = transcription[index]["words"]
end_time = trans[word_index]["end"]
if trans[word_index]["start"] < elapsed_time < trans[word_index]["end"]:
video_text = trans[word_index]["text"]
elif elapsed_time >= trans[word_index]["end"]:
#index += 1
word_index += 1
if (word_index >= len(trans)):
index += 1
word_index = 0
# get boundary of this text
textsize = cv2.getTextSize(video_text, font, 3, 6)[0]
# get coords based on boundary
textX = int((frame.shape[1] - textsize[0]) / 2)
textY = int((frame.shape[0] + textsize[1]) / 2)
cv2.putText(frame,
video_text,
(textX, textY),
font, 3,
(0, 255, 255),
6,
cv2.LINE_4)
# Define the resize scale
scale_percent = 50 # Resize to 50% of the original size
# Get new dimensions
width = 1080
height = 1920
new_size = (width, height)
# Resize the frame
resized_frame = cv2.resize(frame, new_size)
video_writer.write(resized_frame)
cv2.imshow('video', resized_frame)
cv2.waitKey(wait_time)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cv2.destroyAllWindows()
video_writer.release()
cap.release()
When I run this script the audio matches the text in the video perfectly and it runs for the correct amount of time to match with the audio (2 min 44 sec). However, the saved video CompletedVideo.mp4 only lasts for 1 min 10 sec. I am unsure why the video has sped up. The fps is 60 fps. If you require any more information please let me know and thanks in advance.
I have tried changing the fps, changing the wait_time after writing each frame. I am expecting the CompletedVideo.mp4 to be 2 min 44 sec long not 1 min 10 sec long.