about summary refs log tree commit diff stats
path: root/registry/fetch.go
diff options
context:
space:
mode:
Diffstat (limited to 'registry/fetch.go')
-rw-r--r-- registry/fetch.go 277
1 files changed, 277 insertions, 0 deletions
diff --git a/registry/fetch.go b/registry/fetch.go
new file mode 100644
index 0000000..9adf4ec
--- /dev/null
+++ b/registry/fetch.go
@@ -0,0 +1,277 @@
+/*
+Copyright (c) 2019 Ben Morrison (gbmor)
+
+This file is part of Registry.
+
+Registry is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Registry is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Registry.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package registry // import "git.sr.ht/~gbmor/getwtxt/registry"
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+)
+
+// GetTwtxt fetches the raw twtxt data found at urlKey.
+//
+// urlKey must begin with "http://" or "https://"; no other URL
+// validation is performed. The request is issued through doReq, so a
+// nil client results in a client with a 10s timeout being used.
+// Generally, the *http.Client inside a given Registry instance should
+// be passed.
+//
+// The response must have status 200 and a Content-Type containing
+// "text/plain". The status is checked first so that an error page
+// served with another content type is reported by its status code.
+//
+// The returned boolean is true when urlKey ends in "/api/plain/tweets"
+// or "/api/plain/tweets/all", meaning the data is another registry's
+// output and should be passed to ParseRegistryTwtxt. When it is false
+// the data is a single user's twtxt file and should be passed to
+// ParseUserTwtxt. On any error the byte slice is nil and the boolean
+// is false.
+func GetTwtxt(urlKey string, client *http.Client) ([]byte, bool, error) {
+	if !strings.HasPrefix(urlKey, "http://") && !strings.HasPrefix(urlKey, "https://") {
+		return nil, false, fmt.Errorf("invalid URL: %v", urlKey)
+	}
+
+	res, err := doReq(urlKey, "GET", "", client)
+	if err != nil {
+		return nil, false, err
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode != http.StatusOK {
+		return nil, false, fmt.Errorf("didn't get 200 from remote server, received %v: %v", res.StatusCode, urlKey)
+	}
+
+	// Media types are case-insensitive, so compare in lower case.
+	var textPlain bool
+	for _, v := range res.Header["Content-Type"] {
+		if strings.Contains(strings.ToLower(v), "text/plain") {
+			textPlain = true
+			break
+		}
+	}
+	if !textPlain {
+		return nil, false, fmt.Errorf("received non-text/plain response body from %v", urlKey)
+	}
+
+	twtxt, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		return nil, false, fmt.Errorf("error reading response body from %v: %v", urlKey, err)
+	}
+
+	// Signal that we're adding another twtxt registry as a "user"
+	isRegistry := strings.HasSuffix(urlKey, "/api/plain/tweets") ||
+		strings.HasSuffix(urlKey, "/api/plain/tweets/all")
+
+	return twtxt, isRegistry, nil
+}
+
+// DiffTwtxt issues a conditional HEAD request on the user's remote
+// twtxt data, sending the user's stored LastModified value (if any)
+// as the If-Modified-Since header.
+//
+// Return values:
+//   - invalid URL scheme: false and an error.
+//   - user not in the registry: true and an error.
+//   - request failure: false and the error from doReq.
+//   - 200 OK: the first non-empty Last-Modified response header, if
+//     present, replaces the stored LastModified; returns true.
+//   - 304 Not Modified or any other status: false, nil error.
+//
+// The registry lock and the user's lock are both held for the
+// duration of the request. Both are released on every return path.
+func (registry *Registry) DiffTwtxt(urlKey string) (bool, error) {
+	if !strings.HasPrefix(urlKey, "http://") && !strings.HasPrefix(urlKey, "https://") {
+		return false, fmt.Errorf("invalid URL: %v", urlKey)
+	}
+
+	registry.Mu.Lock()
+	// Deferred immediately so the early "not in registry" return
+	// below cannot leave the registry locked.
+	defer registry.Mu.Unlock()
+
+	user, ok := registry.Users[urlKey]
+	if !ok {
+		return true, fmt.Errorf("user not in registry")
+	}
+
+	user.Mu.Lock()
+	// Runs before the registry unlock above (defers are LIFO), so the
+	// write-back still happens while the registry lock is held.
+	defer func() {
+		registry.Users[urlKey] = user
+		user.Mu.Unlock()
+	}()
+
+	res, err := doReq(urlKey, "HEAD", user.LastModified, registry.HTTPClient)
+	if err != nil {
+		return false, err
+	}
+	// Close the body even though HEAD responses carry no payload, so
+	// the transport can release or reuse the connection.
+	defer res.Body.Close()
+
+	switch res.StatusCode {
+	case http.StatusOK:
+		for _, e := range res.Header["Last-Modified"] {
+			if e != "" {
+				user.LastModified = e
+				break
+			}
+		}
+		return true, nil
+
+	case http.StatusNotModified:
+		return false, nil
+	}
+
+	return false, nil
+}
+
+// internal function. boilerplate for http requests.
+func doReq(urlKey, method, modTime string, client *http.Client) (*http.Response, error) {
+	if client == nil {
+		client = &http.Client{
+			Transport:     nil,
+			CheckRedirect: nil,
+			Jar:           nil,
+			Timeout:       10 * time.Second,
+		}
+	}
+
+	var b []byte
+	buf := bytes.NewBuffer(b)
+	req, err := http.NewRequest(method, urlKey, buf)
+	if err != nil {
+		return nil, err
+	}
+
+	if modTime != "" {
+		req.Header.Set("If-Modified-Since", modTime)
+	}
+
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("couldn't %v %v: %v", method, urlKey, err)
+	}
+
+	return res, nil
+}
+
+// ParseUserTwtxt takes a fetched twtxt file in the form of
+// a slice of bytes, parses it, and returns it as a
+// TimeMap. The output may then be passed to Index.AddUser()
+//
+// Blank lines and lines beginning with "#" are skipped. Every other
+// line must consist of exactly two tab-separated columns, a timestamp
+// and the status text; otherwise parsing stops and a nil TimeMap is
+// returned with an error. The timestamp is passed through
+// fixTimestamp and parsed as RFC3339. Each entry is stored as
+// nickname, urlKey and the original line, joined by tabs.
+//
+// Lines whose timestamp cannot be parsed are skipped and reported.
+// A scanner failure (for example an over-long line) is reported too.
+// In both cases the entries parsed so far are returned together with
+// a non-nil error holding one message per problem.
+func ParseUserTwtxt(twtxt []byte, nickname, urlKey string) (TimeMap, error) {
+	if len(twtxt) == 0 {
+		return nil, fmt.Errorf("no data to parse in twtxt file")
+	}
+
+	var erz strings.Builder
+	scanner := bufio.NewScanner(bytes.NewReader(twtxt))
+	timemap := NewTimeMap()
+
+	for scanner.Scan() {
+		nopadding := strings.TrimSpace(scanner.Text())
+		if nopadding == "" || strings.HasPrefix(nopadding, "#") {
+			continue
+		}
+
+		columns := strings.Split(nopadding, "\t")
+		if len(columns) != 2 {
+			return nil, fmt.Errorf("improperly formatted data in twtxt file")
+		}
+
+		thetime, err := time.Parse(time.RFC3339, fixTimestamp(columns[0]))
+		if err != nil {
+			fmt.Fprintf(&erz, "unable to retrieve date: %v\n", err)
+			// Skip the line: storing it would file the status under
+			// the zero time and overwrite any earlier bad line.
+			continue
+		}
+
+		timemap[thetime] = nickname + "\t" + urlKey + "\t" + nopadding
+	}
+
+	// Scan returns false on both EOF and failure; only Err tells them apart.
+	if err := scanner.Err(); err != nil {
+		fmt.Fprintf(&erz, "unable to read twtxt data: %v\n", err)
+	}
+
+	if erz.Len() == 0 {
+		return timemap, nil
+	}
+	return timemap, fmt.Errorf("%v", erz.String())
+}
+
+func fixTimestamp(ts string) string {
+	normalizeTimestamp := regexp.MustCompile(`[\+][\d][\d][:][\d][\d]`)
+	return strings.TrimSpace(normalizeTimestamp.ReplaceAllString(ts, "Z"))
+}
+
+// ParseRegistryTwtxt takes output from a remote registry and outputs
+// the accessible user data via a slice of Users.
+//
+// Blank lines and lines beginning with "#" are skipped. Every other
+// line must consist of exactly four tab-separated columns, of which
+// the first is used as the nickname, the second as the URL and the
+// third as an RFC3339 timestamp; otherwise parsing stops and a nil
+// slice is returned with an error.
+//
+// A line is merged into the first already-collected user with the
+// same nickname or the same URL; otherwise a new User is created with
+// Date set to the current time. Each status is stored as the whole
+// original line, keyed by its timestamp.
+//
+// Lines whose timestamp cannot be parsed are skipped and reported.
+// A scanner failure (for example an over-long line) is reported too.
+// In both cases the users collected so far are returned together
+// with a non-nil error holding one message per problem. When nothing
+// went wrong the error is nil.
+func ParseRegistryTwtxt(twtxt []byte) ([]*User, error) {
+	if len(twtxt) == 0 {
+		return nil, fmt.Errorf("received no data")
+	}
+
+	var erz strings.Builder
+	scanner := bufio.NewScanner(bytes.NewReader(twtxt))
+	userdata := []*User{}
+
+	for scanner.Scan() {
+		nopadding := strings.TrimSpace(scanner.Text())
+		if nopadding == "" || strings.HasPrefix(nopadding, "#") {
+			continue
+		}
+
+		columns := strings.Split(nopadding, "\t")
+		if len(columns) != 4 {
+			return nil, fmt.Errorf("improperly formatted data")
+		}
+
+		thetime, err := time.Parse(time.RFC3339, columns[2])
+		if err != nil {
+			fmt.Fprintf(&erz, "%v\n", err)
+			continue
+		}
+
+		parsednickname, parsedurl := columns[0], columns[1]
+
+		var existing *User
+		for _, e := range userdata {
+			if e.Nick == parsednickname || e.URL == parsedurl {
+				existing = e
+				break
+			}
+		}
+
+		if existing != nil {
+			// userdata holds pointers, so this updates the slice entry in place.
+			existing.Status[thetime] = nopadding
+			continue
+		}
+
+		userdata = append(userdata, &User{
+			Mu:   sync.RWMutex{},
+			Nick: parsednickname,
+			URL:  parsedurl,
+			Date: time.Now().Format(time.RFC3339),
+			Status: TimeMap{
+				thetime: nopadding,
+			},
+		})
+	}
+
+	// Scan returns false on both EOF and failure; only Err tells them apart.
+	if err := scanner.Err(); err != nil {
+		fmt.Fprintf(&erz, "unable to read registry data: %v\n", err)
+	}
+
+	// Only report an error when something was actually collected.
+	if erz.Len() == 0 {
+		return userdata, nil
+	}
+	return userdata, fmt.Errorf("%v", erz.String())
+}