Major internal data structure overhaul, with major memory improvements. A cache-cleaning mechanism will now need to be added.

This commit is contained in:
Ward Wouts 2004-10-14 11:46:31 +00:00
parent 014598d807
commit f54d8cb0b4

View file

@ -1,4 +1,4 @@
# $Dwarf: article.rb,v 1.88 2004/06/16 08:16:23 ward Exp $
# $Dwarf: article.rb,v 1.89 2004/09/01 11:25:46 ward Exp $
# $Source$
#
@ -22,6 +22,7 @@ require 'net/nntp'
require 'news/newsrc'
require 'tempfile'
require 'timeout'
#require 'yaml'
class ArticleError < RuntimeError; end
class TempError < ArticleError; end
@ -29,26 +30,37 @@ class PermError < ArticleError; end
class Article
Debuglevel = 0
Debuglevel = 1
Message = Struct.new(:messid, :id, :server, :subject)
def initialize(nntpservers, groupname, newsrc="~/.newsrc")
@messids = []
@ids = []
@servers = []
@subjects = []
@messageinfo = []
@sorted = false
@grouped = false
@groups = {}
@gotten = {}
@group = groupname
@preselectpatterns = []
@newids = {}
@serverlist = nntpservers.split('|')
@connections = {}
@serverlist.collect{|server|
@connections[server] = {}
@newids[server] = {}
begin
p server
p Time.now
begin
timeout(180) do
@connections[server]["nntp"] = Net::NNTP.new(server)
end
rescue TimeoutError
puts "Timeout due to sucky server, reconnecting"
retry
end
p Time.now
@connections[server]["skip_ids"] = Set::IntSpan.new()
@connections[server]["newsrc"] = News::Newsrc.new("#{newsrc}.#{server}")
set_skip_ids(server, @connections[server]["newsrc"].marked_articles(@group))
@ -80,16 +92,41 @@ def reconnect(server)
print "Succesfully reconnected to #{server}\n"
end
# Prints a crude memory profile of this object to stdout.
#
# Output format: a "memprof:" line, a "global:" line (kept for output
# compatibility; nothing is listed under it), then for every instance
# variable its name on one line followed by "X: " and, when the value
# responds to #size, that size.  Values without #size leave "X: "
# unterminated, exactly as before.
#
# The value is read with instance_variable_get instead of evaluating the
# variable name as code, so this works whether instance_variables yields
# Strings or Symbols, and no exception is used for control flow.
def memusage
  print "memprof:\n"
  print "global:\n"
  instance_variables.each do |ivar|
    puts ivar
    print "X: "
    value = instance_variable_get(ivar)
    puts value.size if value.respond_to?(:size)
  end
end
# Appends one pattern to the list consulted by #preselect.
# Returns the pattern list itself.
def add_preselect_pattern(regexp)
  @preselectpatterns << regexp
end
# Tells whether +subject+ is matched by at least one registered
# preselect pattern.  Returns true on the first match, false when no
# pattern matches (including when no patterns are registered).
def preselect(subject)
  @preselectpatterns.any? { |pattern| !pattern.match(subject).nil? }
end
# Records one article in this object's in-memory bookkeeping and flags the
# derived state as stale.
#
# messid  - message-id of the article
# id      - article number; stored via to_i
# server  - server the article was seen on
# subject - subject line of the article
#
# NOTE(review): two storage layouts appear side by side here: the four
# parallel arrays (@messids/@ids/@servers/@subjects, with @sorted) and a
# single Message struct pushed onto @messageinfo.  This looks like the
# before and after of a refactoring rendered together -- confirm which
# lines are live before relying on either layout.
def add(messid, id, server, subject)
# print "Messid: #{messid}\n"
# print "Id: #{id}\n"
# print "Server: #{server}\n"
# print "Subject: #{subject}\n"
@messids.push(messid)
@ids.push(id.to_i)
@servers.push(server)
@subjects.push(subject)
@sorted = false
@messageinfo.push(Message.new(messid, id.to_i, server, subject))
# Any previously built subject grouping no longer covers this article.
@grouped = false
end
@ -169,6 +206,7 @@ def get_articles(cachedir=false)
for id in art.keys
if art[id].has_key?("subject") and art[id].has_key?("messid")
print "adding: #{art[id]["messid"]}, #{id}, #{server}, #{art[id]["subject"]}\n" if Debuglevel > 2
@newids[server][id.to_i] = true
add(art[id]["messid"], id, server, art[id]["subject"])
end
end
@ -268,31 +306,6 @@ def get_xhdr(server, range, header)
end
end
# if xhdr doesn't work, this should be used
# for i in (range.diff(@connections[server]["skip_ids"]).elements)
# begin
# @connections[server]["nntp"].stat(i)
# resp, id, messid, list = @connections[server]["nntp"].head(i)
# for j in list
# if j =~ /Subject: (.*)/
# subj=$1
# end
# end
# print "get_articles messid: #{messid}\n" if Debuglevel > 1
# print "get_articles id: #{id}\n" if Debuglevel > 1
# print "get_articles server: #{server}\n" if Debuglevel > 1
# print "get_articles subject: #{subj}\n" if Debuglevel > 1
# add(messid, id, server, subj)
# rescue Net::NNTP::RuntimeError
# print "whoopsie couldn't stat #{i}\n" if Debuglevel > 1
# end
# end
# Returns the subject-group hash, building it first via group_subjects
# when it has not been built since the last change.
def get_groups
  group_subjects if !@grouped
  @groups
end
# Returns the newsgroup name stored in @group.
def get_groupname
  @group
end
@ -344,29 +357,31 @@ end
def get_group_body(subj)
result = []
group_subject_sort(subj)
return false if @groups[subj]["messages"] == nil
for i in (0...@groups[subj]["messages"].length)
unless @gotten.has_key?(@groups[subj]["messages"][i])
# puts @groups[subj].to_yaml
return false if @groups[subj]["messageinfo"] == nil
for i in (0...@groups[subj]["messageinfo"].length)
unless @gotten.has_key?(@groups[subj]["messageinfo"][i][:messid])
print "getting article: #{i}\n" if Debuglevel > 1
print "getting article: #{subj}\n" if Debuglevel > 1
print "full subject: #{@groups[subj]["subject"][i]}\n" if Debuglevel > 0
print "message id: #{@groups[subj]["messages"][i]}\n" if Debuglevel > 1
print "id: #{@groups[subj]["ids"][i]}\n" if Debuglevel > 1
print "server: #{@groups[subj]["servers"][i]}\n" if Debuglevel > 0
print "full subject: #{@groups[subj]["messageinfo"][i][:subject]}\n" if Debuglevel > 0
print "message id: #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 1
print "id: #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 1
print "server: #{@groups[subj]["messageinfo"][i][:server]}\n" if Debuglevel > 0
resp = false
while resp == false
if @serverlist.include?(@groups[subj]["servers"][i])
resp, id, messid, list = get_body(@groups[subj]["servers"][i], @groups[subj]["messages"][i])
if @serverlist.include?(@groups[subj]["messageinfo"][i][:server])
resp, id, messid, list = get_body(@groups[subj]["messageinfo"][i][:server], @groups[subj]["messageinfo"][i][:messid])
else
resp = false
end
if resp == false
if Debuglevel > 1
print "mess-id i: #{@groups[subj]["messages"][i]}\n"
print "mess-id i+1: #{@groups[subj]["messages"][i+1]}\n"
# XXX dit moet netter kunnen
print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" if @groups[subj]["messageinfo"][i+1] != nil
end
if (i+1 < @groups[subj]["messages"].length) and
(@groups[subj]["messages"][i] == @groups[subj]["messages"][i+1])
if (i+1 < @groups[subj]["messageinfo"].length) and
(@groups[subj]["messageinfo"][i][:messid] == @groups[subj]["messageinfo"][i+1][:messid])
print " Trying next server...\n"
i += 1
else
@ -374,7 +389,7 @@ def get_group_body(subj)
end
end
end
@gotten[ @groups[subj]["messages"][i] ] = true
@gotten[ @groups[subj]["messageinfo"][i][:messid] ] = true
result = list
end
end
@ -384,23 +399,24 @@ end
def get_group_body_first(subj)
group_subject_sort(subj)
i = 0
unless @groups[subj]["messages"] != nil && @groups[subj]["messages"][0]
unless @groups[subj]["messageinfo"] != nil && @groups[subj]["messageinfo"][0][:messid]
return false
end
while @gotten.has_key?(@groups[subj]["messages"][0]) == false
while @gotten.has_key?(@groups[subj]["messageinfo"][0][:messid]) == false
print "getting article: #{subj}\n" if Debuglevel > 0
print "full subject: #{@groups[subj]["subject"][0]}\n" if Debuglevel > 0
print "message id: #{@groups[subj]["messages"][i]}\n" if Debuglevel > 1
print "id: #{@groups[subj]["ids"][i]}\n" if Debuglevel > 1
print "server: #{@groups[subj]["servers"][0]}\n" if Debuglevel > 0
print "full subject: #{@groups[subj]["messageinfo"][0][:subject]}\n" if Debuglevel > 0
print "message id: #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 1
print "id: #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 1
print "server: #{@groups[subj]["messageinfo"][0][:server]}\n" if Debuglevel > 0
resp = false
while resp == false
resp, id, messid, list = get_body(@groups[subj]["servers"][i], @groups[subj]["messages"][i])
resp, id, messid, list = get_body(@groups[subj]["messageinfo"][i][:server], @groups[subj]["messageinfo"][i][:messid])
if resp == false
print "mess-id i: #{@groups[subj]["messages"][i]}\n"
print "mess-id i+1: #{@groups[subj]["messages"][i+1]}\n"
if (i+1 < @groups[subj]["messages"].length) and
(@groups[subj]["messages"][i] == @groups[subj]["messages"][i+1])
print "mess-id i: #{@groups[subj]["messageinfo"][i][:messid]}\n"
# XXX dit moet netter kunnen
print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" if @groups[subj]["messageinfo"][i+1] != nil
if (i+1 < @groups[subj]["messageinfo"].length) and
(@groups[subj]["messageinfo"][i][:messid] == @groups[subj]["messageinfo"][i+1][:messid])
print "Trying next server...\n"
i += 1
else
@ -408,29 +424,31 @@ def get_group_body_first(subj)
end
end
end
@gotten[@groups[subj]["messages"][i]] = true
@gotten[@groups[subj]["messageinfo"][i][:messid]] = true
end
return list
end
def get_group_body_rest(subj, file=nil)
result = []
for i in (1...@groups[subj]["messages"].length)
unless @gotten.has_key?(@groups[subj]["messages"][i])
for i in (1...@groups[subj]["messageinfo"].length)
unless @gotten.has_key?(@groups[subj]["messageinfo"][i][:messid])
print "getting article: #{i}\n" if Debuglevel > 1
print "getting article: #{subj}\n" if Debuglevel > 1
print "full subject: #{@groups[subj]["subject"][i]}\n" if Debuglevel > 0
print "message id: #{@groups[subj]["messages"][i]}\n" if Debuglevel > 1
print "id: #{@groups[subj]["ids"][i]}\n" if Debuglevel > 1
print "server: #{@groups[subj]["servers"][i]}\n" if Debuglevel > 0
print "full subject: #{@groups[subj]["messageinfo"][i][:subject]}\n" if Debuglevel > 0
print "message id: #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 1
print "id: #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 1
print "server: #{@groups[subj]["messageinfo"][i][:server]}\n" if Debuglevel > 0
resp = false
while resp == false
resp, id, messid, list = get_body(@groups[subj]["servers"][i], @groups[subj]["messages"][i])
resp, id, messid, list = get_body(@groups[subj]["messageinfo"][i][:server], @groups[subj]["messages"][i])
if resp == false
print "mess-id i: #{@groups[subj]["messages"][i]}\n"
print "mess-id i+1: #{@groups[subj]["messages"][i+1]}\n"
if (i+1 < @groups[subj]["messages"].length) and
(@groups[subj]["messages"][i] == @groups[subj]["messages"][i+1])
print "mess-id i: #{@groups[subj]["messageinfo"][i][:messid]}\n"
# print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n"
# XXX dit moet netter kunnen
print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" if @groups[subj]["messageinfo"][i+1] != nil
if (i+1 < @groups[subj]["messageinfo"].length) and
(@groups[subj]["messageinfo"][i][:messid] == @groups[subj]["messageinfo"][i+1][:messid])
print "Trying next server...\n"
i += 1
else
@ -438,7 +456,7 @@ def get_group_body_rest(subj, file=nil)
end
end
end
@gotten[ @groups[subj]["messages"][i] ] = true
@gotten[ @groups[subj]["messageinfo"][i][:messid] ] = true
if file
list.collect{|line| file.print "#{line}\n"}
else
@ -454,16 +472,15 @@ def get_group_subjects
return @groups.keys
end
# Returns the "messages" entry of the group stored under +subject+,
# building the grouping first via group_subjects when it is stale.
def get_group_messids(subject)
  group_subjects if !@grouped
  entry = @groups[subject]
  entry["messages"]
end
def group_is_complete(subj)
group_subjects unless @grouped
#print "Subject: #{subj}\n"
print "length: #{@groups[subj]["messages"].length} total: #{@groups[subj]["total"].to_i}\n" if Debuglevel > 1
umessids = @groups[subj]["messages"].uniq
print "length: #{@groups[subj]["messageinfo"].length} total: #{@groups[subj]["total"].to_i}\n" if Debuglevel > 1
messids = []
@groups[subj]["messageinfo"].each {|x|
messids.push(x[:messid])
}
umessids = messids.uniq
if (umessids.length ) >= @groups[subj]["total"].to_i
return true
else
@ -479,39 +496,25 @@ def group_is_multipart(subj)
@groups[subj]["total"].to_i > 1
end
# Returns the @messids array (the same object, not a copy).
def get_messids
  @messids
end
# Returns the @subjects array (the same object, not a copy).
def get_subjects
  @subjects
end
def group_subjects
@groups = {}
for i in (0...@subjects.length)
print "group subjects: #{i} #{@subjects[i]}\n" if Debuglevel > 3
if @subjects[i] =~ /(.*)\((\d+)\/(\d+)\)(.*)/ || @subjects[i] =~ /(.*)\[(\d+)\/(\d+)\](.*)/
for i in (0...@messageinfo.length)
print "group subjects: #{i} #{@messageinfo[i][:subject]}\n" if Debuglevel > 3
if @messageinfo[i][:subject] =~ /(.*)\((\d+)\/(\d+)\)(.*)/ || @messageinfo[i][:subject] =~ /(.*)\[(\d+)\/(\d+)\](.*)/
j = "#{$1}#{$4} (#{$3})"
number = $2
total = $3
else
j = @subjects[i]
j = @messageinfo[i][:subject]
number = 1
total = 1
end
if @groups.has_key?(j) and number.to_i != 0
@groups[j]["messages"].push(@messids[i])
@groups[j]["ids"].push(@ids[i].to_i)
@groups[j]["servers"].push(@servers[i])
@groups[j]["subject"].push(@subjects[i])
@groups[j]["messageinfo"].push(@messageinfo[i])
elsif number.to_i != 0
@groups[j] = {}
@groups[j]["total"] = total
@groups[j]["messages"] = [ @messids[i] ]
@groups[j]["ids"] = [ @ids[i].to_i ]
@groups[j]["servers"] = [ @servers[i] ]
@groups[j]["subject"] = [ @subjects[i] ]
@groups[j]["messageinfo"] = [ (@messageinfo[i]) ]
end
end
@grouped = true
@ -528,9 +531,9 @@ end
# For every article in the group stored under +subject+, calls
# mark(@group, <article id>) on the "newsrc" object of the server that
# article belongs to.  Articles whose server has no entry in @connections
# are skipped.
#
# NOTE(review): two variants of the loop appear back to back below -- one
# indexing the "messages"/"servers"/"ids" arrays, one indexing the
# "messageinfo" structs -- while sharing a single set of `end`s.  This
# looks like the before and after of a change rendered together; confirm
# which variant is live.
def group_update_newsrc(subject)
print "running group_update_newsrc\n";
for i in (0...@groups[subject]["messages"].length)
if @connections[@groups[subject]["servers"][i]]
@connections[@groups[subject]["servers"][i]]["newsrc"].mark(@group, @groups[subject]["ids"][i])
for i in (0...@groups[subject]["messageinfo"].length)
if @connections[@groups[subject]["messageinfo"][i][:server]]
@connections[@groups[subject]["messageinfo"][i][:server]]["newsrc"].mark(@group, @groups[subject]["messageinfo"][i][:id])
end
end
end
@ -548,8 +551,12 @@ def check_cache(cachedir)
end
def read_cache(cachedir)
p "reading cache"
p Time.now
filename = "#{cachedir}/#{@group}.ripnewscache"
excludes = {}
# id | messageid | subject
lineregexp = Regexp.new('^(\d+)\|(.*?)\|(.*)$')
for server in @connections.keys
excludes[server] = {}
@connections[server]["skip_ids"].elements.collect!{|x| excludes[server][x]=true}
@ -558,11 +565,14 @@ def read_cache(cachedir)
lines = file.readlines
lines.collect{|line|
# id | messageid | subject
if line =~ /^(\d+)\|(.*?)\|(.*)$/
#if line =~ /^(\d+)\|(.*?)\|(.*)$/
if lineregexp.match(line) != nil
unless excludes.has_key?(server) and excludes[server].has_key?($1.to_i) or
$1.to_i < @connections[server]["first"].to_i or
$1.to_i > @connections[server]["last"].to_i
if preselect($3)
add($2, $1, server, $3)
end
@connections[server]["skip_ids"].insert($1.to_i)
end
end
@ -570,24 +580,28 @@ def read_cache(cachedir)
file.close
end
end
p Time.now
memusage
end
def save_cache(cachedir, server)
p "writing cache"
p Time.now
filename = "#{cachedir}/#{@group}.ripnewscache"
if FileTest.directory?( cachedir )
file = File.new( "#{filename}.#{server}.new", "w" ) or print "couldn't open cachefile for writing\n"
if ! File.copy("#{filename}.#{server}","#{filename}.#{server}.new")
puts "Couldn't renew cache"
end
file = File.new( "#{filename}.#{server}.new", "a+" ) or print "couldn't open cachefile for writing\n"
print "Updating cache...\n"
cache = []
for i in (0...@subjects.length)
cache.push("#{@ids[i]}|#{@messids[i]}|#{@subjects[i]}\n") if @servers[i] == server
for i in (0...@messageinfo.length)
if @newids[server].has_key?(@messageinfo[i][:id])
cache.push("#{@messageinfo[i][:id]}|#{@messageinfo[i][:messid]}|#{@messageinfo[i][:subject]}\n") if @messageinfo[i][:server] == server
end
end
cache.sort!
file.print cache
# for i in (0...@subjects.length)
# if @servers[i] == server
# file.print "#{@ids[i]}|#{@messids[i]}|#{@subjects[i]}\n"
# end
# end
file.close
if ( File.move("#{filename}.#{server}.new", "#{filename}.#{server}") )
print "Cache updated for #{server}\n"
@ -595,6 +609,7 @@ def save_cache(cachedir, server)
print "Couldn't update #{server} cache\n"
end
end
p Time.now
end
###############################################################
@ -615,54 +630,49 @@ end
###############################################################
# Re-orders the entries of the group stored under +subj+.
#
# Entries are copied into a scratch array (only those whose server is
# listed in @serverlist), sorted by subject using ward_sort, with ties
# broken by the server's position in @serverlist, and then written back
# into @groups[subj] after that hash has been cleared.
#
# NOTE(review): two variants of each step appear side by side below -- one
# working on the parallel "subject"/"messages"/"ids"/"servers" arrays with
# positional a[0]/a[3] access, one working on "messageinfo" structs with
# a[:subject]/a[:server] access.  This looks like the before and after of
# a refactoring rendered together; confirm which lines are live.
def group_subject_sort(subj)
# XXX Why am I actually using sort_arr here instead of in-place sorting?
#print "Sorting articles\n"
# Map each server name to its index in @serverlist; used as the tie-breaker.
serverhash = {}
for i in (0...@serverlist.length)
serverhash[@serverlist[i]] = i
end
sort_arr = []
for i in (0...@groups[subj]["subject"].length)
print "subj sort #{@groups[subj]["subject"][i]}\n" if Debuglevel > 2
print "subj sort #{@groups[subj]["messages"][i]}\n" if Debuglevel > 2
print "subj sort #{@groups[subj]["ids"][i]}\n" if Debuglevel > 2
print "subj sort #{@groups[subj]["servers"][i]}\n" if Debuglevel > 2
sort_arr.push( [
@groups[subj]["subject"][i].dup,
@groups[subj]["messages"][i].dup,
#@groups[subj]["ids"][i].dup, # scary to change this
@groups[subj]["ids"][i],
@groups[subj]["servers"][i].dup
] ) if serverhash[@groups[subj]["servers"][i]] != nil
for i in (0...@groups[subj]["messageinfo"].length)
print "subj sort #{@groups[subj]["messageinfo"][i][:subject]}\n" if Debuglevel > 2
print "subj sort #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 2
print "subj sort #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 2
print "subj sort #{@groups[subj]["messageinfo"][i][:server]}\n" if Debuglevel > 2
sort_arr.push(
@groups[subj]["messageinfo"][i].dup
) if serverhash[@groups[subj]["messageinfo"][i][:server]] != nil
end
# Primary key: subject via ward_sort; secondary key: server position.
sort_arr.sort!{|a,b|
r = ward_sort(a[0], b[0])
if serverhash[a[3]] == nil or serverhash[b[3]] == nil
print "serverhash[a[3]]: #{serverhash[a[3]]}\n"
print "serverhash[b[3]]: #{serverhash[b[3]]}\n"
print "a[3]: #{a[3]}\n"
# NOTE(review): both variants of the "b" debug line below interpolate a, not b.
print "b[3]: #{a[3]}\n"
r = ward_sort(a[:subject], b[:subject])
if serverhash[a[:server]] == nil or serverhash[b[:server]] == nil
print "serverhash[a[:server]]: #{serverhash[a[:server]]}\n"
print "serverhash[b[:server]]: #{serverhash[b[:server]]}\n"
print "a[:server]: #{a[:server]}\n"
print "b[:server]: #{a[:server]}\n"
print "strange things going on here...\n"
end
if r == 0
r = serverhash[a[3]] <=> serverhash[b[3]]
r = serverhash[a[:server]] <=> serverhash[b[:server]]
end
r
}
# XXX hmmmm, looks like @groups[x]["total"] does get destroyed here...
# XXX apparently it doesn't matter, but it isn't tidy
@groups[subj].clear
# NOTE(review): the rebuild below tests has_key?("messages").  If only the
# "messageinfo" lines are live, that key is never set after the clear, so
# every iteration would take the else branch and overwrite "messageinfo"
# with a one-element array -- confirm against the real file.
sort_arr.collect{|i|
if @groups[subj].has_key?("messages")
@groups[subj]["subject"].push(i[0])
@groups[subj]["messages"].push(i[1])
@groups[subj]["ids"].push(i[2])
@groups[subj]["servers"].push(i[3])
@groups[subj]["messageinfo"].push(i)
else
@groups[subj]["subject"] = [i[0]]
@groups[subj]["messages"] = [i[1]]
@groups[subj]["ids"] = [i[2]]
@groups[subj]["servers"] = [i[3]]
@groups[subj]["messageinfo"] = [ i ]
end
print "subject sort: #{i[0]}\n" if Debuglevel > 2
print "server: #{i[3]}\n" if Debuglevel > 2
print "subject sort: #{i[:subject]}\n" if Debuglevel > 2
print "server: #{i[:server]}\n" if Debuglevel > 2
}
#print "Done sorting\n"
end
@ -719,10 +729,7 @@ end
def quit
# just testing if these should be reset...
@messids = []
@ids = []
@servers = []
@subjects = []
@messageinfo = []
for server in @connections.keys
begin